{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "9f1b7d31",
   "metadata": {},
   "source": [
    "## 1.导入包"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "bb4bc3a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.preprocessing import MinMaxScaler, OneHotEncoder\n",
    "from xgboost import XGBRegressor\n",
    "from sklearn.metrics import mean_squared_error\n",
    "import matplotlib.pyplot as plt\n",
    "from xgboost import plot_tree\n",
    "from matplotlib.pylab import rcParams\n",
    "%matplotlib inline\n",
    "import seaborn as sns\n",
    "from sklearn.model_selection import KFold\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "import joblib"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c1a7101b",
   "metadata": {},
   "source": [
    "## 2.读取文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "a1ae5e62",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\anaconda3\\envs\\pytorch\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3441: DtypeWarning: Columns (7) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  exec(code_obj, self.user_global_ns, self.user_ns)\n"
     ]
    }
   ],
   "source": [
    "# Directory holding the Rossmann Store CSV files; edit this one line to relocate the data.\n",
    "DATA_DIR = \"F:/学习/数学建模/Rossmann Store\"\n",
    "\n",
    "# StateHoliday holds both the number 0 and letter codes such as \"a\". With pandas' default\n",
    "# chunked type inference the column ends up with mixed int/str values and read_csv emits a\n",
    "# DtypeWarning (column 7 of train.csv), so pin the column to str in both files that have it.\n",
    "# NOTE(review): values are now always strings (\"0\", \"a\", ...); any later comparison against\n",
    "# the integer 0 must use \"0\" instead -- confirm against the downstream cells.\n",
    "HOLIDAY_DTYPE = {\"StateHoliday\": str}\n",
    "\n",
    "store_df = pd.read_csv(f\"{DATA_DIR}/store.csv\")\n",
    "train_df = pd.read_csv(f\"{DATA_DIR}/train.csv\", dtype=HOLIDAY_DTYPE)\n",
    "test_df = pd.read_csv(f\"{DATA_DIR}/test.csv\", dtype=HOLIDAY_DTYPE)\n",
    "submission_df = pd.read_csv(f\"{DATA_DIR}/sample_submission.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bc58590a",
   "metadata": {},
   "source": [
    "## 3.数据展示"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "929a8624",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Store</th>\n",
       "      <th>DayOfWeek</th>\n",
       "      <th>Date</th>\n",
       "      <th>Sales</th>\n",
       "      <th>Customers</th>\n",
       "      <th>Open</th>\n",
       "      <th>Promo</th>\n",
       "      <th>StateHoliday</th>\n",
       "      <th>SchoolHoliday</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>2015-07-31</td>\n",
       "      <td>5263</td>\n",
       "      <td>555</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "      <td>2015-07-31</td>\n",
       "      <td>6064</td>\n",
       "      <td>625</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "      <td>2015-07-31</td>\n",
       "      <td>8314</td>\n",
       "      <td>821</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>2015-07-31</td>\n",
       "      <td>13995</td>\n",
       "      <td>1498</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>2015-07-31</td>\n",
       "      <td>4822</td>\n",
       "      <td>559</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1017204</th>\n",
       "      <td>1111</td>\n",
       "      <td>2</td>\n",
       "      <td>2013-01-01</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>a</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1017205</th>\n",
       "      <td>1112</td>\n",
       "      <td>2</td>\n",
       "      <td>2013-01-01</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>a</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1017206</th>\n",
       "      <td>1113</td>\n",
       "      <td>2</td>\n",
       "      <td>2013-01-01</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>a</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1017207</th>\n",
       "      <td>1114</td>\n",
       "      <td>2</td>\n",
       "      <td>2013-01-01</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>a</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1017208</th>\n",
       "      <td>1115</td>\n",
       "      <td>2</td>\n",
       "      <td>2013-01-01</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>a</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1017209 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         Store  DayOfWeek        Date  Sales  Customers  Open  Promo  \\\n",
       "0            1          5  2015-07-31   5263        555     1      1   \n",
       "1            2          5  2015-07-31   6064        625     1      1   \n",
       "2            3          5  2015-07-31   8314        821     1      1   \n",
       "3            4          5  2015-07-31  13995       1498     1      1   \n",
       "4            5          5  2015-07-31   4822        559     1      1   \n",
       "...        ...        ...         ...    ...        ...   ...    ...   \n",
       "1017204   1111          2  2013-01-01      0          0     0      0   \n",
       "1017205   1112          2  2013-01-01      0          0     0      0   \n",
       "1017206   1113          2  2013-01-01      0          0     0      0   \n",
       "1017207   1114          2  2013-01-01      0          0     0      0   \n",
       "1017208   1115          2  2013-01-01      0          0     0      0   \n",
       "\n",
       "        StateHoliday  SchoolHoliday  \n",
       "0                  0              1  \n",
       "1                  0              1  \n",
       "2                  0              1  \n",
       "3                  0              1  \n",
       "4                  0              1  \n",
       "...              ...            ...  \n",
       "1017204            a              1  \n",
       "1017205            a              1  \n",
       "1017206            a              1  \n",
       "1017207            a              1  \n",
       "1017208            a              1  \n",
       "\n",
       "[1017209 rows x 9 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display train_df (loaded from train.csv) as the cell output to inspect its columns and size.\n",
    "train_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "64cdca95",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id</th>\n",
       "      <th>Store</th>\n",
       "      <th>DayOfWeek</th>\n",
       "      <th>Date</th>\n",
       "      <th>Open</th>\n",
       "      <th>Promo</th>\n",
       "      <th>StateHoliday</th>\n",
       "      <th>SchoolHoliday</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>2015-09-17</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>2015-09-17</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "      <td>4</td>\n",
       "      <td>2015-09-17</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>8</td>\n",
       "      <td>4</td>\n",
       "      <td>2015-09-17</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>9</td>\n",
       "      <td>4</td>\n",
       "      <td>2015-09-17</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41083</th>\n",
       "      <td>41084</td>\n",
       "      <td>1111</td>\n",
       "      <td>6</td>\n",
       "      <td>2015-08-01</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41084</th>\n",
       "      <td>41085</td>\n",
       "      <td>1112</td>\n",
       "      <td>6</td>\n",
       "      <td>2015-08-01</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41085</th>\n",
       "      <td>41086</td>\n",
       "      <td>1113</td>\n",
       "      <td>6</td>\n",
       "      <td>2015-08-01</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41086</th>\n",
       "      <td>41087</td>\n",
       "      <td>1114</td>\n",
       "      <td>6</td>\n",
       "      <td>2015-08-01</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41087</th>\n",
       "      <td>41088</td>\n",
       "      <td>1115</td>\n",
       "      <td>6</td>\n",
       "      <td>2015-08-01</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>41088 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          Id  Store  DayOfWeek        Date  Open  Promo StateHoliday  \\\n",
       "0          1      1          4  2015-09-17   1.0      1            0   \n",
       "1          2      3          4  2015-09-17   1.0      1            0   \n",
       "2          3      7          4  2015-09-17   1.0      1            0   \n",
       "3          4      8          4  2015-09-17   1.0      1            0   \n",
       "4          5      9          4  2015-09-17   1.0      1            0   \n",
       "...      ...    ...        ...         ...   ...    ...          ...   \n",
       "41083  41084   1111          6  2015-08-01   1.0      0            0   \n",
       "41084  41085   1112          6  2015-08-01   1.0      0            0   \n",
       "41085  41086   1113          6  2015-08-01   1.0      0            0   \n",
       "41086  41087   1114          6  2015-08-01   1.0      0            0   \n",
       "41087  41088   1115          6  2015-08-01   1.0      0            0   \n",
       "\n",
       "       SchoolHoliday  \n",
       "0                  0  \n",
       "1                  0  \n",
       "2                  0  \n",
       "3                  0  \n",
       "4                  0  \n",
       "...              ...  \n",
       "41083              0  \n",
       "41084              0  \n",
       "41085              0  \n",
       "41086              0  \n",
       "41087              1  \n",
       "\n",
       "[41088 rows x 8 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display test_df (loaded from test.csv) as the cell output to inspect its columns and size.\n",
    "test_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "fe93fcdf",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Store</th>\n",
       "      <th>StoreType</th>\n",
       "      <th>Assortment</th>\n",
       "      <th>CompetitionDistance</th>\n",
       "      <th>CompetitionOpenSinceMonth</th>\n",
       "      <th>CompetitionOpenSinceYear</th>\n",
       "      <th>Promo2</th>\n",
       "      <th>Promo2SinceWeek</th>\n",
       "      <th>Promo2SinceYear</th>\n",
       "      <th>PromoInterval</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>c</td>\n",
       "      <td>a</td>\n",
       "      <td>1270.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>2008.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>a</td>\n",
       "      <td>a</td>\n",
       "      <td>570.0</td>\n",
       "      <td>11.0</td>\n",
       "      <td>2007.0</td>\n",
       "      <td>1</td>\n",
       "      <td>13.0</td>\n",
       "      <td>2010.0</td>\n",
       "      <td>Jan,Apr,Jul,Oct</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>a</td>\n",
       "      <td>a</td>\n",
       "      <td>14130.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>2006.0</td>\n",
       "      <td>1</td>\n",
       "      <td>14.0</td>\n",
       "      <td>2011.0</td>\n",
       "      <td>Jan,Apr,Jul,Oct</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>c</td>\n",
       "      <td>c</td>\n",
       "      <td>620.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>2009.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>a</td>\n",
       "      <td>a</td>\n",
       "      <td>29910.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>2015.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1110</th>\n",
       "      <td>1111</td>\n",
       "      <td>a</td>\n",
       "      <td>a</td>\n",
       "      <td>1900.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>2014.0</td>\n",
       "      <td>1</td>\n",
       "      <td>31.0</td>\n",
       "      <td>2013.0</td>\n",
       "      <td>Jan,Apr,Jul,Oct</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1111</th>\n",
       "      <td>1112</td>\n",
       "      <td>c</td>\n",
       "      <td>c</td>\n",
       "      <td>1880.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>2006.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1112</th>\n",
       "      <td>1113</td>\n",
       "      <td>a</td>\n",
       "      <td>c</td>\n",
       "      <td>9260.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1113</th>\n",
       "      <td>1114</td>\n",
       "      <td>a</td>\n",
       "      <td>c</td>\n",
       "      <td>870.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1114</th>\n",
       "      <td>1115</td>\n",
       "      <td>d</td>\n",
       "      <td>c</td>\n",
       "      <td>5350.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>22.0</td>\n",
       "      <td>2012.0</td>\n",
       "      <td>Mar,Jun,Sept,Dec</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1115 rows × 10 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      Store StoreType Assortment  CompetitionDistance  \\\n",
       "0         1         c          a               1270.0   \n",
       "1         2         a          a                570.0   \n",
       "2         3         a          a              14130.0   \n",
       "3         4         c          c                620.0   \n",
       "4         5         a          a              29910.0   \n",
       "...     ...       ...        ...                  ...   \n",
       "1110   1111         a          a               1900.0   \n",
       "1111   1112         c          c               1880.0   \n",
       "1112   1113         a          c               9260.0   \n",
       "1113   1114         a          c                870.0   \n",
       "1114   1115         d          c               5350.0   \n",
       "\n",
       "      CompetitionOpenSinceMonth  CompetitionOpenSinceYear  Promo2  \\\n",
       "0                           9.0                    2008.0       0   \n",
       "1                          11.0                    2007.0       1   \n",
       "2                          12.0                    2006.0       1   \n",
       "3                           9.0                    2009.0       0   \n",
       "4                           4.0                    2015.0       0   \n",
       "...                         ...                       ...     ...   \n",
       "1110                        6.0                    2014.0       1   \n",
       "1111                        4.0                    2006.0       0   \n",
       "1112                        NaN                       NaN       0   \n",
       "1113                        NaN                       NaN       0   \n",
       "1114                        NaN                       NaN       1   \n",
       "\n",
       "      Promo2SinceWeek  Promo2SinceYear     PromoInterval  \n",
       "0                 NaN              NaN               NaN  \n",
       "1                13.0           2010.0   Jan,Apr,Jul,Oct  \n",
       "2                14.0           2011.0   Jan,Apr,Jul,Oct  \n",
       "3                 NaN              NaN               NaN  \n",
       "4                 NaN              NaN               NaN  \n",
       "...               ...              ...               ...  \n",
       "1110             31.0           2013.0   Jan,Apr,Jul,Oct  \n",
       "1111              NaN              NaN               NaN  \n",
       "1112              NaN              NaN               NaN  \n",
       "1113              NaN              NaN               NaN  \n",
       "1114             22.0           2012.0  Mar,Jun,Sept,Dec  \n",
       "\n",
       "[1115 rows x 10 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display store_df (loaded from store.csv) as the cell output to inspect the per-store attributes.\n",
    "store_df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1d7add18",
   "metadata": {},
   "source": [
    "## 4.数据合并"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "1fcbe249",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store-level details live in store_df. Attach them to every training and test row through the\n",
    "# shared Store id so that both sets gain the store attributes as additional features.\n",
    "# how=\"left\" keeps every row of the left-hand table; validate=\"many_to_one\" makes pandas raise\n",
    "# MergeError if a Store id appears more than once in store_df, which would otherwise silently\n",
    "# duplicate rows in the merged frames.\n",
    "merge_train_df = train_df.merge(store_df, how=\"left\", on=\"Store\", validate=\"many_to_one\")\n",
    "merge_test_df = test_df.merge(store_df, how=\"left\", on=\"Store\", validate=\"many_to_one\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "7706d8c0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Store</th>\n",
       "      <th>DayOfWeek</th>\n",
       "      <th>Date</th>\n",
       "      <th>Sales</th>\n",
       "      <th>Customers</th>\n",
       "      <th>Open</th>\n",
       "      <th>Promo</th>\n",
       "      <th>StateHoliday</th>\n",
       "      <th>SchoolHoliday</th>\n",
       "      <th>StoreType</th>\n",
       "      <th>Assortment</th>\n",
       "      <th>CompetitionDistance</th>\n",
       "      <th>CompetitionOpenSinceMonth</th>\n",
       "      <th>CompetitionOpenSinceYear</th>\n",
       "      <th>Promo2</th>\n",
       "      <th>Promo2SinceWeek</th>\n",
       "      <th>Promo2SinceYear</th>\n",
       "      <th>PromoInterval</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>2015-07-31</td>\n",
       "      <td>5263</td>\n",
       "      <td>555</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>c</td>\n",
       "      <td>a</td>\n",
       "      <td>1270.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>2008.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "      <td>2015-07-31</td>\n",
       "      <td>6064</td>\n",
       "      <td>625</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>a</td>\n",
       "      <td>a</td>\n",
       "      <td>570.0</td>\n",
       "      <td>11.0</td>\n",
       "      <td>2007.0</td>\n",
       "      <td>1</td>\n",
       "      <td>13.0</td>\n",
       "      <td>2010.0</td>\n",
       "      <td>Jan,Apr,Jul,Oct</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "      <td>2015-07-31</td>\n",
       "      <td>8314</td>\n",
       "      <td>821</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>a</td>\n",
       "      <td>a</td>\n",
       "      <td>14130.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>2006.0</td>\n",
       "      <td>1</td>\n",
       "      <td>14.0</td>\n",
       "      <td>2011.0</td>\n",
       "      <td>Jan,Apr,Jul,Oct</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>2015-07-31</td>\n",
       "      <td>13995</td>\n",
       "      <td>1498</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>c</td>\n",
       "      <td>c</td>\n",
       "      <td>620.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>2009.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>2015-07-31</td>\n",
       "      <td>4822</td>\n",
       "      <td>559</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>a</td>\n",
       "      <td>a</td>\n",
       "      <td>29910.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>2015.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1017204</th>\n",
       "      <td>1111</td>\n",
       "      <td>2</td>\n",
       "      <td>2013-01-01</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>a</td>\n",
       "      <td>1</td>\n",
       "      <td>a</td>\n",
       "      <td>a</td>\n",
       "      <td>1900.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>2014.0</td>\n",
       "      <td>1</td>\n",
       "      <td>31.0</td>\n",
       "      <td>2013.0</td>\n",
       "      <td>Jan,Apr,Jul,Oct</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1017205</th>\n",
       "      <td>1112</td>\n",
       "      <td>2</td>\n",
       "      <td>2013-01-01</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>a</td>\n",
       "      <td>1</td>\n",
       "      <td>c</td>\n",
       "      <td>c</td>\n",
       "      <td>1880.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>2006.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1017206</th>\n",
       "      <td>1113</td>\n",
       "      <td>2</td>\n",
       "      <td>2013-01-01</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>a</td>\n",
       "      <td>1</td>\n",
       "      <td>a</td>\n",
       "      <td>c</td>\n",
       "      <td>9260.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1017207</th>\n",
       "      <td>1114</td>\n",
       "      <td>2</td>\n",
       "      <td>2013-01-01</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>a</td>\n",
       "      <td>1</td>\n",
       "      <td>a</td>\n",
       "      <td>c</td>\n",
       "      <td>870.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1017208</th>\n",
       "      <td>1115</td>\n",
       "      <td>2</td>\n",
       "      <td>2013-01-01</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>a</td>\n",
       "      <td>1</td>\n",
       "      <td>d</td>\n",
       "      <td>c</td>\n",
       "      <td>5350.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>22.0</td>\n",
       "      <td>2012.0</td>\n",
       "      <td>Mar,Jun,Sept,Dec</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1017209 rows × 18 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         Store  DayOfWeek        Date  Sales  Customers  Open  Promo  \\\n",
       "0            1          5  2015-07-31   5263        555     1      1   \n",
       "1            2          5  2015-07-31   6064        625     1      1   \n",
       "2            3          5  2015-07-31   8314        821     1      1   \n",
       "3            4          5  2015-07-31  13995       1498     1      1   \n",
       "4            5          5  2015-07-31   4822        559     1      1   \n",
       "...        ...        ...         ...    ...        ...   ...    ...   \n",
       "1017204   1111          2  2013-01-01      0          0     0      0   \n",
       "1017205   1112          2  2013-01-01      0          0     0      0   \n",
       "1017206   1113          2  2013-01-01      0          0     0      0   \n",
       "1017207   1114          2  2013-01-01      0          0     0      0   \n",
       "1017208   1115          2  2013-01-01      0          0     0      0   \n",
       "\n",
       "        StateHoliday  SchoolHoliday StoreType Assortment  CompetitionDistance  \\\n",
       "0                  0              1         c          a               1270.0   \n",
       "1                  0              1         a          a                570.0   \n",
       "2                  0              1         a          a              14130.0   \n",
       "3                  0              1         c          c                620.0   \n",
       "4                  0              1         a          a              29910.0   \n",
       "...              ...            ...       ...        ...                  ...   \n",
       "1017204            a              1         a          a               1900.0   \n",
       "1017205            a              1         c          c               1880.0   \n",
       "1017206            a              1         a          c               9260.0   \n",
       "1017207            a              1         a          c                870.0   \n",
       "1017208            a              1         d          c               5350.0   \n",
       "\n",
       "         CompetitionOpenSinceMonth  CompetitionOpenSinceYear  Promo2  \\\n",
       "0                              9.0                    2008.0       0   \n",
       "1                             11.0                    2007.0       1   \n",
       "2                             12.0                    2006.0       1   \n",
       "3                              9.0                    2009.0       0   \n",
       "4                              4.0                    2015.0       0   \n",
       "...                            ...                       ...     ...   \n",
       "1017204                        6.0                    2014.0       1   \n",
       "1017205                        4.0                    2006.0       0   \n",
       "1017206                        NaN                       NaN       0   \n",
       "1017207                        NaN                       NaN       0   \n",
       "1017208                        NaN                       NaN       1   \n",
       "\n",
       "         Promo2SinceWeek  Promo2SinceYear     PromoInterval  \n",
       "0                    NaN              NaN               NaN  \n",
       "1                   13.0           2010.0   Jan,Apr,Jul,Oct  \n",
       "2                   14.0           2011.0   Jan,Apr,Jul,Oct  \n",
       "3                    NaN              NaN               NaN  \n",
       "4                    NaN              NaN               NaN  \n",
       "...                  ...              ...               ...  \n",
       "1017204             31.0           2013.0   Jan,Apr,Jul,Oct  \n",
       "1017205              NaN              NaN               NaN  \n",
       "1017206              NaN              NaN               NaN  \n",
       "1017207              NaN              NaN               NaN  \n",
       "1017208             22.0           2012.0  Mar,Jun,Sept,Dec  \n",
       "\n",
       "[1017209 rows x 18 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merge_train_df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b2d55503",
   "metadata": {},
   "source": [
    "## 5.数据预处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "8b994ccf",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 1017209 entries, 0 to 1017208\n",
      "Data columns (total 18 columns):\n",
      " #   Column                     Non-Null Count    Dtype  \n",
      "---  ------                     --------------    -----  \n",
      " 0   Store                      1017209 non-null  int64  \n",
      " 1   DayOfWeek                  1017209 non-null  int64  \n",
      " 2   Date                       1017209 non-null  object \n",
      " 3   Sales                      1017209 non-null  int64  \n",
      " 4   Customers                  1017209 non-null  int64  \n",
      " 5   Open                       1017209 non-null  int64  \n",
      " 6   Promo                      1017209 non-null  int64  \n",
      " 7   StateHoliday               1017209 non-null  object \n",
      " 8   SchoolHoliday              1017209 non-null  int64  \n",
      " 9   StoreType                  1017209 non-null  object \n",
      " 10  Assortment                 1017209 non-null  object \n",
      " 11  CompetitionDistance        1014567 non-null  float64\n",
      " 12  CompetitionOpenSinceMonth  693861 non-null   float64\n",
      " 13  CompetitionOpenSinceYear   693861 non-null   float64\n",
      " 14  Promo2                     1017209 non-null  int64  \n",
      " 15  Promo2SinceWeek            509178 non-null   float64\n",
      " 16  Promo2SinceYear            509178 non-null   float64\n",
      " 17  PromoInterval              509178 non-null   object \n",
      "dtypes: float64(5), int64(8), object(5)\n",
      "memory usage: 147.5+ MB\n"
     ]
    }
   ],
   "source": [
    "merge_train_df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cf6f964d",
   "metadata": {},
   "source": [
    "### 5.1 提取时间信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "cbcd5179",
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_time(df):\n",
    "    df['Date'] = pd.to_datetime(df['Date'])\n",
    "    df['Year'] = df['Date'].dt.year\n",
    "    df['Month'] = df[\"Date\"].dt.month\n",
    "    df['Day'] = df['Date'].dt.day\n",
    "    df['WeekOfYear'] = df['Date'].dt.isocalendar().week"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "d4de1db9",
   "metadata": {},
   "outputs": [],
   "source": [
    "extract_time(merge_train_df)\n",
    "extract_time(merge_test_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "0326ff80",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Store</th>\n",
       "      <th>DayOfWeek</th>\n",
       "      <th>Date</th>\n",
       "      <th>Sales</th>\n",
       "      <th>Customers</th>\n",
       "      <th>Open</th>\n",
       "      <th>Promo</th>\n",
       "      <th>StateHoliday</th>\n",
       "      <th>SchoolHoliday</th>\n",
       "      <th>StoreType</th>\n",
       "      <th>...</th>\n",
       "      <th>CompetitionOpenSinceMonth</th>\n",
       "      <th>CompetitionOpenSinceYear</th>\n",
       "      <th>Promo2</th>\n",
       "      <th>Promo2SinceWeek</th>\n",
       "      <th>Promo2SinceYear</th>\n",
       "      <th>PromoInterval</th>\n",
       "      <th>Year</th>\n",
       "      <th>Month</th>\n",
       "      <th>Day</th>\n",
       "      <th>WeekOfYear</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>2015-07-31</td>\n",
       "      <td>5263</td>\n",
       "      <td>555</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>c</td>\n",
       "      <td>...</td>\n",
       "      <td>9.0</td>\n",
       "      <td>2008.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2015</td>\n",
       "      <td>7</td>\n",
       "      <td>31</td>\n",
       "      <td>31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "      <td>2015-07-31</td>\n",
       "      <td>6064</td>\n",
       "      <td>625</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>a</td>\n",
       "      <td>...</td>\n",
       "      <td>11.0</td>\n",
       "      <td>2007.0</td>\n",
       "      <td>1</td>\n",
       "      <td>13.0</td>\n",
       "      <td>2010.0</td>\n",
       "      <td>Jan,Apr,Jul,Oct</td>\n",
       "      <td>2015</td>\n",
       "      <td>7</td>\n",
       "      <td>31</td>\n",
       "      <td>31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "      <td>2015-07-31</td>\n",
       "      <td>8314</td>\n",
       "      <td>821</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>a</td>\n",
       "      <td>...</td>\n",
       "      <td>12.0</td>\n",
       "      <td>2006.0</td>\n",
       "      <td>1</td>\n",
       "      <td>14.0</td>\n",
       "      <td>2011.0</td>\n",
       "      <td>Jan,Apr,Jul,Oct</td>\n",
       "      <td>2015</td>\n",
       "      <td>7</td>\n",
       "      <td>31</td>\n",
       "      <td>31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>2015-07-31</td>\n",
       "      <td>13995</td>\n",
       "      <td>1498</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>c</td>\n",
       "      <td>...</td>\n",
       "      <td>9.0</td>\n",
       "      <td>2009.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2015</td>\n",
       "      <td>7</td>\n",
       "      <td>31</td>\n",
       "      <td>31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>2015-07-31</td>\n",
       "      <td>4822</td>\n",
       "      <td>559</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>a</td>\n",
       "      <td>...</td>\n",
       "      <td>4.0</td>\n",
       "      <td>2015.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2015</td>\n",
       "      <td>7</td>\n",
       "      <td>31</td>\n",
       "      <td>31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1017204</th>\n",
       "      <td>1111</td>\n",
       "      <td>2</td>\n",
       "      <td>2013-01-01</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>a</td>\n",
       "      <td>1</td>\n",
       "      <td>a</td>\n",
       "      <td>...</td>\n",
       "      <td>6.0</td>\n",
       "      <td>2014.0</td>\n",
       "      <td>1</td>\n",
       "      <td>31.0</td>\n",
       "      <td>2013.0</td>\n",
       "      <td>Jan,Apr,Jul,Oct</td>\n",
       "      <td>2013</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1017205</th>\n",
       "      <td>1112</td>\n",
       "      <td>2</td>\n",
       "      <td>2013-01-01</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>a</td>\n",
       "      <td>1</td>\n",
       "      <td>c</td>\n",
       "      <td>...</td>\n",
       "      <td>4.0</td>\n",
       "      <td>2006.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2013</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1017206</th>\n",
       "      <td>1113</td>\n",
       "      <td>2</td>\n",
       "      <td>2013-01-01</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>a</td>\n",
       "      <td>1</td>\n",
       "      <td>a</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2013</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1017207</th>\n",
       "      <td>1114</td>\n",
       "      <td>2</td>\n",
       "      <td>2013-01-01</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>a</td>\n",
       "      <td>1</td>\n",
       "      <td>a</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2013</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1017208</th>\n",
       "      <td>1115</td>\n",
       "      <td>2</td>\n",
       "      <td>2013-01-01</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>a</td>\n",
       "      <td>1</td>\n",
       "      <td>d</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>22.0</td>\n",
       "      <td>2012.0</td>\n",
       "      <td>Mar,Jun,Sept,Dec</td>\n",
       "      <td>2013</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1017209 rows × 22 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         Store  DayOfWeek       Date  Sales  Customers  Open  Promo  \\\n",
       "0            1          5 2015-07-31   5263        555     1      1   \n",
       "1            2          5 2015-07-31   6064        625     1      1   \n",
       "2            3          5 2015-07-31   8314        821     1      1   \n",
       "3            4          5 2015-07-31  13995       1498     1      1   \n",
       "4            5          5 2015-07-31   4822        559     1      1   \n",
       "...        ...        ...        ...    ...        ...   ...    ...   \n",
       "1017204   1111          2 2013-01-01      0          0     0      0   \n",
       "1017205   1112          2 2013-01-01      0          0     0      0   \n",
       "1017206   1113          2 2013-01-01      0          0     0      0   \n",
       "1017207   1114          2 2013-01-01      0          0     0      0   \n",
       "1017208   1115          2 2013-01-01      0          0     0      0   \n",
       "\n",
       "        StateHoliday  SchoolHoliday StoreType  ... CompetitionOpenSinceMonth  \\\n",
       "0                  0              1         c  ...                       9.0   \n",
       "1                  0              1         a  ...                      11.0   \n",
       "2                  0              1         a  ...                      12.0   \n",
       "3                  0              1         c  ...                       9.0   \n",
       "4                  0              1         a  ...                       4.0   \n",
       "...              ...            ...       ...  ...                       ...   \n",
       "1017204            a              1         a  ...                       6.0   \n",
       "1017205            a              1         c  ...                       4.0   \n",
       "1017206            a              1         a  ...                       NaN   \n",
       "1017207            a              1         a  ...                       NaN   \n",
       "1017208            a              1         d  ...                       NaN   \n",
       "\n",
       "         CompetitionOpenSinceYear  Promo2  Promo2SinceWeek  Promo2SinceYear  \\\n",
       "0                          2008.0       0              NaN              NaN   \n",
       "1                          2007.0       1             13.0           2010.0   \n",
       "2                          2006.0       1             14.0           2011.0   \n",
       "3                          2009.0       0              NaN              NaN   \n",
       "4                          2015.0       0              NaN              NaN   \n",
       "...                           ...     ...              ...              ...   \n",
       "1017204                    2014.0       1             31.0           2013.0   \n",
       "1017205                    2006.0       0              NaN              NaN   \n",
       "1017206                       NaN       0              NaN              NaN   \n",
       "1017207                       NaN       0              NaN              NaN   \n",
       "1017208                       NaN       1             22.0           2012.0   \n",
       "\n",
       "            PromoInterval  Year Month  Day  WeekOfYear  \n",
       "0                     NaN  2015     7   31          31  \n",
       "1         Jan,Apr,Jul,Oct  2015     7   31          31  \n",
       "2         Jan,Apr,Jul,Oct  2015     7   31          31  \n",
       "3                     NaN  2015     7   31          31  \n",
       "4                     NaN  2015     7   31          31  \n",
       "...                   ...   ...   ...  ...         ...  \n",
       "1017204   Jan,Apr,Jul,Oct  2013     1    1           1  \n",
       "1017205               NaN  2013     1    1           1  \n",
       "1017206               NaN  2013     1    1           1  \n",
       "1017207               NaN  2013     1    1           1  \n",
       "1017208  Mar,Jun,Sept,Dec  2013     1    1           1  \n",
       "\n",
       "[1017209 rows x 22 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merge_train_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "857ed491",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    844392\n",
       "0    172817\n",
       "Name: Open, dtype: int64"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#删除训练集中商店关门的日子\n",
    "merge_train_df.Open.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "bc427f27",
   "metadata": {},
   "outputs": [],
   "source": [
    "merge_train_df = merge_train_df[merge_train_df['Open']==1].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "c1715701",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 844392 entries, 0 to 1017190\n",
      "Data columns (total 22 columns):\n",
      " #   Column                     Non-Null Count   Dtype         \n",
      "---  ------                     --------------   -----         \n",
      " 0   Store                      844392 non-null  int64         \n",
      " 1   DayOfWeek                  844392 non-null  int64         \n",
      " 2   Date                       844392 non-null  datetime64[ns]\n",
      " 3   Sales                      844392 non-null  int64         \n",
      " 4   Customers                  844392 non-null  int64         \n",
      " 5   Open                       844392 non-null  int64         \n",
      " 6   Promo                      844392 non-null  int64         \n",
      " 7   StateHoliday               844392 non-null  object        \n",
      " 8   SchoolHoliday              844392 non-null  int64         \n",
      " 9   StoreType                  844392 non-null  object        \n",
      " 10  Assortment                 844392 non-null  object        \n",
      " 11  CompetitionDistance        842206 non-null  float64       \n",
      " 12  CompetitionOpenSinceMonth  575773 non-null  float64       \n",
      " 13  CompetitionOpenSinceYear   575773 non-null  float64       \n",
      " 14  Promo2                     844392 non-null  int64         \n",
      " 15  Promo2SinceWeek            421085 non-null  float64       \n",
      " 16  Promo2SinceYear            421085 non-null  float64       \n",
      " 17  PromoInterval              421085 non-null  object        \n",
      " 18  Year                       844392 non-null  int64         \n",
      " 19  Month                      844392 non-null  int64         \n",
      " 20  Day                        844392 non-null  int64         \n",
      " 21  WeekOfYear                 844392 non-null  UInt32        \n",
      "dtypes: UInt32(1), datetime64[ns](1), float64(5), int64(11), object(4)\n",
      "memory usage: 145.8+ MB\n"
     ]
    }
   ],
   "source": [
    "merge_train_df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8cee9b84",
   "metadata": {},
   "source": [
    "### 5.2缺失值处理"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ffde850f",
   "metadata": {},
   "source": [
    "- 首先分析 CompetitionOpenSinceMonth 和 CompetitionOpenSinceYear 这两列数据，这两列的含义是出现竞争店铺的年份和月份，存在缺失值的原因是该商店附近不存在竞争店铺，可以用 0 填补。\n",
    "- 其次可以通过上一步求得的年份和月份创建一列新数据来展示竞争者出现了多久"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "f0b4cd84",
   "metadata": {},
   "outputs": [],
   "source": [
    "def cal_during_compitition(df):\n",
    "    df[\"CompetitionOpen\"] = 12*(df['Year']-df['CompetitionOpenSinceYear'])+(df['Month']-df['CompetitionOpenSinceMonth'])\n",
    "    df[\"CompetitionOpen\"] = df[\"CompetitionOpen\"].apply(lambda x:0 if x<0 else x).fillna(0)                                          "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "541cc8e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "cal_during_compitition(merge_train_df)\n",
    "cal_during_compitition(merge_test_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "c2960b00",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>CompetitionOpenSinceMonth</th>\n",
       "      <th>CompetitionOpenSinceYear</th>\n",
       "      <th>CompetitionOpen</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>9.0</td>\n",
       "      <td>2008.0</td>\n",
       "      <td>82.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>11.0</td>\n",
       "      <td>2007.0</td>\n",
       "      <td>92.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>12.0</td>\n",
       "      <td>2006.0</td>\n",
       "      <td>103.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>9.0</td>\n",
       "      <td>2009.0</td>\n",
       "      <td>70.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4.0</td>\n",
       "      <td>2015.0</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1016776</th>\n",
       "      <td>9.0</td>\n",
       "      <td>2006.0</td>\n",
       "      <td>76.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1016827</th>\n",
       "      <td>10.0</td>\n",
       "      <td>1999.0</td>\n",
       "      <td>159.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1016863</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1017042</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1017190</th>\n",
       "      <td>3.0</td>\n",
       "      <td>2002.0</td>\n",
       "      <td>130.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>844392 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         CompetitionOpenSinceMonth  CompetitionOpenSinceYear  CompetitionOpen\n",
       "0                              9.0                    2008.0             82.0\n",
       "1                             11.0                    2007.0             92.0\n",
       "2                             12.0                    2006.0            103.0\n",
       "3                              9.0                    2009.0             70.0\n",
       "4                              4.0                    2015.0              3.0\n",
       "...                            ...                       ...              ...\n",
       "1016776                        9.0                    2006.0             76.0\n",
       "1016827                       10.0                    1999.0            159.0\n",
       "1016863                        NaN                       NaN              0.0\n",
       "1017042                        NaN                       NaN              0.0\n",
       "1017190                        3.0                    2002.0            130.0\n",
       "\n",
       "[844392 rows x 3 columns]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merge_train_df[['CompetitionOpenSinceMonth','CompetitionOpenSinceYear','CompetitionOpen']]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cd62ba05",
   "metadata": {},
   "source": [
    "- 接下来分析 Promo2，Promo2SinceWeek，Promo2SinceYear，PromoInterval 这四列数据\n",
    " - Promo2表示该店铺是否有促销：0表示从未参加过促销，1表示参加过促销\n",
    " - Promo2SinceWeek/Promo2SinceYear表示该店铺 ××年、第×周开始参加促销\n",
    " - PromoInterval表示该店铺促销的时间间隔： \"Feb,May,Aug,Nov\"表示该店铺每年的1月，5月，8月，11月开始搞促销\n",
    "- 然后就可以创建两列数据分别表示促销开始了多久，当前的月份是否有搞促销"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "c8296941",
   "metadata": {},
   "outputs": [],
   "source": [
    "def check_promo_month(row):#检查当前月份是否有搞促销\n",
    "    month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun',              \n",
    "                 7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}\n",
    "    try:\n",
    "        months = (row['PromoInterval'] or '').split(',')\n",
    "        if(row['Promo2Open'] and month2str[row['Month']] in months):\n",
    "            return 1\n",
    "        else:\n",
    "            return 0\n",
    "    except:\n",
    "        return 0\n",
    "    \n",
    "def cal_during_promo(data): #计算促销持续了几个月\n",
    "    data['Promo2Open'] = 12 * (data.Year - data.Promo2SinceYear) + (data.WeekOfYear - data.Promo2SinceWeek)*7/30.5\n",
    "    data['Promo2Open'] = data['Promo2Open'].apply(lambda x: 0 if x < 0 else x).fillna(0)*data['Promo2']#only when there is promo\n",
    "    #创建新列表示当前月份是否有搞促销\n",
    "    data['IsPromo2Month'] = data.apply(check_promo_month, axis=1) * data['Promo2']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "819a4ce2",
   "metadata": {},
   "outputs": [],
   "source": [
    "cal_during_promo(merge_train_df)\n",
    "cal_during_promo(merge_test_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "6d9ce404",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Promo2</th>\n",
       "      <th>Promo2SinceWeek</th>\n",
       "      <th>Promo2SinceYear</th>\n",
       "      <th>PromoInterval</th>\n",
       "      <th>Promo2Open</th>\n",
       "      <th>IsPromo2Month</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>13.0</td>\n",
       "      <td>2010.0</td>\n",
       "      <td>Jan,Apr,Jul,Oct</td>\n",
       "      <td>64.131148</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>14.0</td>\n",
       "      <td>2011.0</td>\n",
       "      <td>Jan,Apr,Jul,Oct</td>\n",
       "      <td>51.901639</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1016776</th>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1016827</th>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1016863</th>\n",
       "      <td>1</td>\n",
       "      <td>48.0</td>\n",
       "      <td>2012.0</td>\n",
       "      <td>Jan,Apr,Jul,Oct</td>\n",
       "      <td>1.213115</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1017042</th>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1017190</th>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>844392 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         Promo2  Promo2SinceWeek  Promo2SinceYear    PromoInterval  \\\n",
       "0             0              NaN              NaN              NaN   \n",
       "1             1             13.0           2010.0  Jan,Apr,Jul,Oct   \n",
       "2             1             14.0           2011.0  Jan,Apr,Jul,Oct   \n",
       "3             0              NaN              NaN              NaN   \n",
       "4             0              NaN              NaN              NaN   \n",
       "...         ...              ...              ...              ...   \n",
       "1016776       0              NaN              NaN              NaN   \n",
       "1016827       0              NaN              NaN              NaN   \n",
       "1016863       1             48.0           2012.0  Jan,Apr,Jul,Oct   \n",
       "1017042       0              NaN              NaN              NaN   \n",
       "1017190       0              NaN              NaN              NaN   \n",
       "\n",
       "         Promo2Open  IsPromo2Month  \n",
       "0          0.000000              0  \n",
       "1         64.131148              1  \n",
       "2         51.901639              1  \n",
       "3          0.000000              0  \n",
       "4          0.000000              0  \n",
       "...             ...            ...  \n",
       "1016776    0.000000              0  \n",
       "1016827    0.000000              0  \n",
       "1016863    1.213115              1  \n",
       "1017042    0.000000              0  \n",
       "1017190    0.000000              0  \n",
       "\n",
       "[844392 rows x 6 columns]"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merge_train_df[['Promo2','Promo2SinceWeek','Promo2SinceYear','PromoInterval','Promo2Open','IsPromo2Month']]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "150b12c7",
   "metadata": {},
   "source": [
    "## 6.特征提取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "6727ffe8",
   "metadata": {},
   "outputs": [],
   "source": [
    "input_cols = ['Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday', \n",
    "              'StoreType', 'Assortment', 'CompetitionDistance', 'CompetitionOpen', \n",
    "              'Day', 'Month', 'Year', 'WeekOfYear',  'Promo2', \n",
    "              'Promo2Open', 'IsPromo2Month']\n",
    "target_col = 'Sales'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "ca75118a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 844392 entries, 0 to 1017190\n",
      "Data columns (total 25 columns):\n",
      " #   Column                     Non-Null Count   Dtype         \n",
      "---  ------                     --------------   -----         \n",
      " 0   Store                      844392 non-null  int64         \n",
      " 1   DayOfWeek                  844392 non-null  int64         \n",
      " 2   Date                       844392 non-null  datetime64[ns]\n",
      " 3   Sales                      844392 non-null  int64         \n",
      " 4   Customers                  844392 non-null  int64         \n",
      " 5   Open                       844392 non-null  int64         \n",
      " 6   Promo                      844392 non-null  int64         \n",
      " 7   StateHoliday               844392 non-null  object        \n",
      " 8   SchoolHoliday              844392 non-null  int64         \n",
      " 9   StoreType                  844392 non-null  object        \n",
      " 10  Assortment                 844392 non-null  object        \n",
      " 11  CompetitionDistance        842206 non-null  float64       \n",
      " 12  CompetitionOpenSinceMonth  575773 non-null  float64       \n",
      " 13  CompetitionOpenSinceYear   575773 non-null  float64       \n",
      " 14  Promo2                     844392 non-null  int64         \n",
      " 15  Promo2SinceWeek            421085 non-null  float64       \n",
      " 16  Promo2SinceYear            421085 non-null  float64       \n",
      " 17  PromoInterval              421085 non-null  object        \n",
      " 18  Year                       844392 non-null  int64         \n",
      " 19  Month                      844392 non-null  int64         \n",
      " 20  Day                        844392 non-null  int64         \n",
      " 21  WeekOfYear                 844392 non-null  UInt32        \n",
      " 22  CompetitionOpen            844392 non-null  float64       \n",
      " 23  Promo2Open                 844392 non-null  float64       \n",
      " 24  IsPromo2Month              844392 non-null  int64         \n",
      "dtypes: UInt32(1), datetime64[ns](1), float64(7), int64(12), object(4)\n",
      "memory usage: 165.1+ MB\n"
     ]
    }
   ],
   "source": [
    "merge_train_df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "ab956d47",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_inputs = merge_train_df[input_cols].copy()\n",
    "targets = merge_train_df[target_col].copy()\n",
    "test_inputs = merge_test_df[input_cols].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "57247311",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 844392 entries, 0 to 1017190\n",
      "Data columns (total 16 columns):\n",
      " #   Column               Non-Null Count   Dtype  \n",
      "---  ------               --------------   -----  \n",
      " 0   Store                844392 non-null  int64  \n",
      " 1   DayOfWeek            844392 non-null  int64  \n",
      " 2   Promo                844392 non-null  int64  \n",
      " 3   StateHoliday         844392 non-null  object \n",
      " 4   SchoolHoliday        844392 non-null  int64  \n",
      " 5   StoreType            844392 non-null  object \n",
      " 6   Assortment           844392 non-null  object \n",
      " 7   CompetitionDistance  842206 non-null  float64\n",
      " 8   CompetitionOpen      844392 non-null  float64\n",
      " 9   Day                  844392 non-null  int64  \n",
      " 10  Month                844392 non-null  int64  \n",
      " 11  Year                 844392 non-null  int64  \n",
      " 12  WeekOfYear           844392 non-null  UInt32 \n",
      " 13  Promo2               844392 non-null  int64  \n",
      " 14  Promo2Open           844392 non-null  float64\n",
      " 15  IsPromo2Month        844392 non-null  int64  \n",
      "dtypes: UInt32(1), float64(3), int64(9), object(3)\n",
      "memory usage: 107.1+ MB\n"
     ]
    }
   ],
   "source": [
    "train_inputs.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "f24e6d0a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the model inputs into numeric features and categorical features.\n",
    "# NOTE(review): 'DayOfWeek' is listed in input_cols but appears in neither list below, so it\n",
    "# never reaches X / X_test (both are built from numeric_cols + encoded_cols) - confirm this is intended.\n",
    "numeric_cols = ['Store', 'Promo', 'SchoolHoliday', 'CompetitionDistance', 'CompetitionOpen', 'Promo2', 'Promo2Open', 'IsPromo2Month',\n",
    "              'Day', 'Month', 'Year', 'WeekOfYear',  ]\n",
    "categorical_cols = ['StateHoliday', 'StoreType', 'Assortment']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "f6afdba8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Store                     0\n",
       "Promo                     0\n",
       "SchoolHoliday             0\n",
       "CompetitionDistance    2186\n",
       "CompetitionOpen           0\n",
       "Promo2                    0\n",
       "Promo2Open                0\n",
       "IsPromo2Month             0\n",
       "Day                       0\n",
       "Month                     0\n",
       "Year                      0\n",
       "WeekOfYear                0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_inputs[numeric_cols].isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "35fabd22",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Store                   0\n",
       "Promo                   0\n",
       "SchoolHoliday           0\n",
       "CompetitionDistance    96\n",
       "CompetitionOpen         0\n",
       "Promo2                  0\n",
       "Promo2Open              0\n",
       "IsPromo2Month           0\n",
       "Day                     0\n",
       "Month                   0\n",
       "Year                    0\n",
       "WeekOfYear              0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_inputs[numeric_cols].isna().sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4ccc1ff6",
   "metadata": {},
   "source": [
    "- 采用最大距离插补缺失距离"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "14065085",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Impute missing competitor distances with the largest distance seen in the training\n",
    "# inputs, and reuse that same training value for the test inputs.\n",
    "max_distance = train_inputs.CompetitionDistance.max()\n",
    "# Assign the filled column back rather than calling fillna(inplace=True) on a column\n",
    "# selection: under pandas Copy-on-Write the in-place form only updates a temporary\n",
    "# object and leaves the DataFrame unchanged.\n",
    "train_inputs['CompetitionDistance'] = train_inputs['CompetitionDistance'].fillna(max_distance)\n",
    "test_inputs['CompetitionDistance'] = test_inputs['CompetitionDistance'].fillna(max_distance)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8ba606a3",
   "metadata": {},
   "source": [
    "- 归一化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "16234c54",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit the min-max scaler on the training inputs only, then apply that same fitted\n",
    "# transform to both the training and the test inputs.\n",
    "scaler = MinMaxScaler().fit(train_inputs[numeric_cols])\n",
    "train_inputs[numeric_cols] = scaler.transform(train_inputs[numeric_cols])\n",
    "test_inputs[numeric_cols] = scaler.transform(test_inputs[numeric_cols])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "01359058",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Store                  0\n",
       "Promo                  0\n",
       "SchoolHoliday          0\n",
       "CompetitionDistance    0\n",
       "CompetitionOpen        0\n",
       "Promo2                 0\n",
       "Promo2Open             0\n",
       "IsPromo2Month          0\n",
       "Day                    0\n",
       "Month                  0\n",
       "Year                   0\n",
       "WeekOfYear             0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_inputs[numeric_cols].isna().sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5d12bec1",
   "metadata": {},
   "source": [
    "- 独热编码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "a1529648",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cast the categorical columns to str in BOTH frames so the test inputs are encoded from\n",
    "# the same value representation the encoder was fitted on. With handle_unknown='ignore',\n",
    "# a mismatch (e.g. int 0 vs '0') would be encoded as zeros for that feature without any error.\n",
    "train_inputs[categorical_cols] = train_inputs[categorical_cols].astype('str')\n",
    "test_inputs[categorical_cols] = test_inputs[categorical_cols].astype('str')\n",
    "encoder = OneHotEncoder(sparse=False, handle_unknown='ignore').fit(train_inputs[categorical_cols])\n",
    "encoded_cols = list(encoder.get_feature_names(categorical_cols))\n",
    "train_inputs[encoded_cols] = encoder.transform(train_inputs[categorical_cols])\n",
    "test_inputs[encoded_cols] = encoder.transform(test_inputs[categorical_cols])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "aadc3b12",
   "metadata": {},
   "outputs": [],
   "source": [
    "X = train_inputs[numeric_cols + encoded_cols]\n",
    "X_test = test_inputs[numeric_cols + encoded_cols]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fa2106ab",
   "metadata": {},
   "source": [
    "## 7.XGBoost回归"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "efdac19a",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = XGBRegressor(random_state=42, n_jobs=-1, n_estimators=20, max_depth=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "ae3c6c1d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 3.61 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
       "             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,\n",
       "             importance_type='gain', interaction_constraints='',\n",
       "             learning_rate=0.300000012, max_delta_step=0, max_depth=4,\n",
       "             min_child_weight=1, missing=nan, monotone_constraints='()',\n",
       "             n_estimators=20, n_jobs=-1, num_parallel_tree=1, random_state=42,\n",
       "             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,\n",
       "             tree_method='exact', validate_parameters=1, verbosity=None)"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "model.fit(X, targets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "2b1dae95",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predictions are made on X, the same rows the model was fitted on, so any error\n",
    "# computed from `preds` is a training error rather than a held-out estimate.\n",
    "preds = model.predict(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "ed426d8b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2436.070232904147"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def rmse(a, b):\n",
    "    \"\"\"Return the root-mean-squared error between two equally sized sets of values.\n",
    "\n",
    "    Args:\n",
    "        a: Array-like of numbers (targets or predictions).\n",
    "        b: Array-like of numbers with the same length as ``a``.\n",
    "\n",
    "    Returns:\n",
    "        The square root of mean_squared_error(a, b). For 1-D inputs this equals what\n",
    "        mean_squared_error(a, b, squared=False) returned.\n",
    "    \"\"\"\n",
    "    # Take the square root explicitly instead of passing squared=False: that keyword was\n",
    "    # deprecated in scikit-learn 1.4 and removed in 1.6, whereas this form runs on any version.\n",
    "    return np.sqrt(mean_squared_error(a, b))\n",
    "rmse(preds, targets)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "45c8093e",
   "metadata": {},
   "source": [
    "- 特征重要性"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "1eaf9618",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>feature</th>\n",
       "      <th>importance</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Promo</td>\n",
       "      <td>0.291977</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>StoreType_b</td>\n",
       "      <td>0.103151</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Promo2</td>\n",
       "      <td>0.096431</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>Assortment_a</td>\n",
       "      <td>0.058846</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>CompetitionDistance</td>\n",
       "      <td>0.051998</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>Assortment_c</td>\n",
       "      <td>0.049951</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Store</td>\n",
       "      <td>0.044921</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Day</td>\n",
       "      <td>0.039690</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>Month</td>\n",
       "      <td>0.036663</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>StoreType_d</td>\n",
       "      <td>0.036597</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                feature  importance\n",
       "1                 Promo    0.291977\n",
       "17          StoreType_b    0.103151\n",
       "5                Promo2    0.096431\n",
       "20         Assortment_a    0.058846\n",
       "3   CompetitionDistance    0.051998\n",
       "22         Assortment_c    0.049951\n",
       "0                 Store    0.044921\n",
       "8                   Day    0.039690\n",
       "9                 Month    0.036663\n",
       "19          StoreType_d    0.036597"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "importance_df = pd.DataFrame({\n",
    "    'feature': X.columns,\n",
    "    'importance': model.feature_importances_\n",
    "}).sort_values('importance', ascending=False)\n",
    "importance_df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "9b7f8b04",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAroAAAGDCAYAAADEYLPhAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAvTElEQVR4nO3de5xdZX3v8c+XAEIgEARRRCCCXLQIEQIUkavWXrSVnqIYkYtaKQpY7fHGwSrScorlWKtYihTl5g0FoYpasArBG5dEAwHlIiCKoIAgdyJJfuePvVI2k5nJHiYze2bN5/16zSt7r8vz/NaTLX7zzLPWTlUhSZIktc1q/S5AkiRJGgsGXUmSJLWSQVeSJEmtZNCVJElSKxl0JUmS1EoGXUmSJLWSQVeSJEmtZNCVpD5L8vMkjyV5uOvnuaugzVesqhp76O+4JJ8dr/6Gk+SwJN/rdx2S+s+gK0kTw59X1bpdP3f2s5gkq/ez/6drstYtaWwYdCVpgkqyfpJPJ7krya+S/GOSac2+rZJ8J8lvk9yb5HNJZjb7zgE2B77WzA6/N8k+Se4Y0P7/zPo2M7LnJflskgeBw4brv4faK8nbk9yc5KEk/9DU/MMkDyb5UpI1m2P3SXJHkv/TXMvPkxw0YBzOTnJPktuTfCDJas2+w5J8P8nHktwHnAucCuzeXPvvmuNeleTHTd+/THJcV/uzmnoPTfKLpoZju/ZPa2q7pbmWBUk2a/Ztl+RbSe5LcmOS143oL1nSmDLoStLEdRawBHgB8BLglcBfN/sC/BPwXOCFwGbAcQBVdTDwC56cJf7nHvt7DXAeMBP43Er678WfADsDfwi8FzgNOKipdXtgbtexzwE2AjYFDgVOS7Jts+9kYH1gS2Bv4BDgTV3n7gbcCmwMvBE4Avhhc+0zm2Meac6bCbwKeFuS/QfU+zJgW+DlwAeTvLDZ/ndNrX8GrAe8GXg0yTrAt4DPN33PBU5J8ge9D5GksWTQlaSJ4cIkv2t+LkzybOBPgXdW1SNVdTfwMeD1AFX1s6r6VlUtrqp7gH+hEwJH44dVdWFVLaMT6Ibsv0cfqaoHq+p64Drgkqq6taoeAL5JJzx3+/vmeuYBXwde18wgHwgcU1UPVdXPgY8CB3edd2dVnVxVS6rqscEKqarLqmpRVS2rqmuBL7DieH24qh6rqmuAa4Adm+1/DXygqm6sjmuq6rfAq4GfV9UZTd8/As4HDhjBGEkaQ65lkqSJYf+q+u/lb5LsCqwB3JVk+ebVgF82+zcGPgHsCcxo9t0/yhp+2fV6i+H679Fvul4/Nsj753S9v7+qHul6fzud2eqNgDWb9937Nh2i7kEl2Q04kc5M8prAM4AvDzjs112vHwXWbV5vBtwySLNbALstXx7RWB04Z2X1SBofzuhK0sT0S2AxsFFVzWx+1quq5b8W/yeggB2qaj06v7JP1/k1oL1HgOnL3zQzpc8acEz3OSvrf1XboFkKsNzmwJ3AvcATdEJl975fDVH3YO+hs7zgq8BmVbU+nXW8GeS4wfwS2GqI7fO6xmdms1zibT22K2mMGXQlaQKqqruAS4CPJlkvyWrNzVzLf90+A3gY+F2STYH3DGjiN3TWtC53E7BWc1PWGsAH6MxqPt3+x8KHk6yZZE86ywK+XFVLgS8BJySZkWQLOmtmh3uU2W+A5y2/2a0xA7ivqh5vZsvfMIK6Tgf+IcnW6dghyYbARcA2SQ5Oskbzs0vX2l5JfWbQlaSJ6xA6v2b/CZ1lCecBmzT7PgzsBDxAZz3rVwac+0/AB5o1v+9u1sW+nU5o+xWdGd47GN5w/a9qv276uJPOjXBHVNUNzb6j6dR7K/A9OrOznxmmre8A1wO/TnJvs+3twPFJHgI+SCc89+pfmuMvAR4EPg2sXVUP0blB7/VN3b8GPsIw/4CQNL5SNdhveCRJGh9J9gE+W1XP63MpklrGGV1JkiS1kkFXkiRJreTS
BUmSJLWSM7qSJElqJYOuJEmSWslvRtMKNtpoo5o1a1a/y5AkSVqpBQsW3FtVA78ABzDoahCzZs1i/vz5/S5DkiRppZLcPtQ+ly5IkiSplZzR1Qp+esdv2fk9Z/e7DEmSNEktOOmQfpcAOKMrSZKkljLoSpIkqZUMupIkSWolg64kSZJayaArSZKkVjLoSpIkqZUMupIkSWolg64kSZJayaArSZKkVvKb0fooyVJgEZ2/h58Ch1bVo/2tSpIkqR2c0e2vx6pqdlVtD/weOKJ7Z5Jp/SlLkiRp8jPoThzfBV6QZJ8klyb5PLAoyVpJzkiyKMmPk+wLkOSwJBcm+VqS25IcleTvmmOuSPLM5rjZzftrk1yQZIN+XqQkSdJ4MehOAElWB/6UzjIGgF2BY6vqRcCRAFX1YmAucFaStZrjtgfe0Bx/AvBoVb0E+CFwSHPM2cD7qmqHpv0PDVHD4UnmJ5m/5NGHVvUlSpIkjTuDbn+tnWQhMB/4BfDpZvtVVXVb8/plwDkAVXUDcDuwTbPv0qp6qKruAR4AvtZsXwTMSrI+MLOq5jXbzwL2GqyQqjqtquZU1ZzVp89YZRcoSZLUL96M1l+PVdXs7g1JAB7p3jTM+Yu7Xi/rer8M/24lSdIU54zuxHc5cBBAkm2AzYEbezmxqh4A7k+yZ7PpYGDeMKdIkiS1hrN+E98pwKlJFgFLgMOqanEz89uLQ5vzpwO3Am8amzIlSZImllRVv2vQBLPOc55f2x384X6XIUmSJqkFJx2y8oNWkSQLqmrOYPtcuiBJkqRWMuhKkiSplQy6kiRJaiWDriRJklrJoCtJkqRWMuhKkiSplQy6kiRJaiW/MEIreOHzNmT+OD7/TpIkaSw4oytJkqRWMuhKkiSplQy6kiRJaiWDriRJklrJoCtJkqRWMuhKkiSplXy8mFbw+7uu5xfHv7jfZUjjYvMPLup3CZKkMeKMriRJklrJoCtJkqRWMuhKkiSplQy6kiRJaiWDriRJklrJoCtJkqRWMuhKkiSplQy6kiRJaiWDriRJklrJoNslybFJrk9ybZKFSXZL8s4k01dR+1c27f4iyT3N64VJZq2K9gfpb58kF41F25IkSROdXwHcSLI78Gpgp6panGQjYE3gXOCzwKMjaGtaVS0duL2qdmv2HwbMqaqjVkXtkiRJWpEzuk/aBLi3qhYDVNW9wAHAc4FLk1wKkGRukkVJrkvykeUnJ3k4yfFJrgR2T/LGJFc1M7afSjJtYIdJVktyc5Jndb3/WZKNkpyZ5NQk301yU5JXN8dMS3JSkqubmee/Wcl1rZfkgiQ/adrz71ySJE0Jhp4nXQJs1oTKU5LsXVWfAO4E9q2qfZM8F/gIsB8wG9glyf7N+esA1zWztr8FDgT2qKrZwFLgoIEdVtUyOrPFy/e9ArimCdkAs4C9gVcBpyZZC3gL8EBV7QLsArw1yfOHua5dgf8NvBjYCvhfgx2U5PAk85PMv++RFSajJUmSJh2DbqOqHgZ2Bg4H7gHObZYYdNsFuKyq7qmqJcDngL2afUuB85vXL2/aujrJwub9lkN0/RngkOb1m4EzuvZ9qaqWVdXNwK3AdsArgUOadq8ENgS2HubSrqqqW5ulFF8AXjbE9Z9WVXOqas4z11lh8lmSJGnScY1ulyYMXgZclmQRcOiAQzLM6Y93rcsNcFZVHdNDn79M8psk+wG78dSZ3xp4eNP20VV18craHqYNSZKk1nNGt5Fk2yTdM6OzgduBh4AZzbYrgb2bNbTTgLnAvEGa+zZwQJKNm7afmWSLYbo/nc4Shi8NuInttc263a3ozAjfCFwMvC3JGk3b2yRZZ5i2d03y/GZt7oHA94Y5VpIkqTWc0X3SusDJSWYCS4Cf0VnGMBf4ZpK7mnW6xwCX0plZ/UZV/efAhqrqJ0k+AFzSBMwngCPpBOfBfJXOkoUzBmy/kU6QfjZwRFU9nuR0Omt3f5QkdJZZ7D/Mdf0QOJHOGt3LgQuG
GwRJkqS2SJW/ye63JHOAj1XVnl3bzgQuqqrzxrueHTZduy76mxeMd7dSX2z+wUX9LkGSNApJFlTVnMH2OaPbZ0neD7yNQZ7KIEmSpKfPoNtnVXUinaUFA7cf1msbSV4MnDNg8+LlX1AhSZI0FRl0W6CqFtG5eU6SJEkNn7ogSZKkVjLoSpIkqZUMupIkSWolg64kSZJayZvRtII1N/kDNv/g/H6XIUmSNCrO6EqSJKmVDLqSJElqJYOuJEmSWsmgK0mSpFYy6EqSJKmVfOqCVnDD3Tewx8l79LsMacS+f/T3+12CJGkCcUZXkiRJrWTQlSRJUisZdCVJktRKBl1JkiS1kkFXkiRJrWTQlSRJUisZdCVJktRKBl1JkiS1kkFXkiRJrWTQHQdJliZZmOS6JF9OMn0c+z4pyQ1Jrk1yQZKZ49W3JElSPxl0x8djVTW7qrYHfg8c0b0zybQx7PtbwPZVtQNwE3DMGPYlSZI0YRh0x993gRck2SfJpUk+DyxKslaSM5IsSvLjJPsCJDksyYVJvpbktiRHJfm75pgrkjyzOW528375zO0GAFV1SVUtafq+AnhePy5akiRpvBl0x1GS1YE/BRY1m3YFjq2qFwFHAlTVi4G5wFlJ1mqO2x54Q3P8CcCjVfUS4IfAIc0xZwPva2ZuFwEfGqSENwPfHKK2w5PMTzL/iYefGN2FSpIkTQAG3fGxdpKFwHzgF8Cnm+1XVdVtzeuXAecAVNUNwO3ANs2+S6vqoaq6B3gA+FqzfREwK8n6wMyqmtdsPwvYq7uAJMcCS4DPDVZgVZ1WVXOqas4a664xqouVJEmaCFbvdwFTxGNVNbt7QxKAR7o3DXP+4q7Xy7reL6OHv8MkhwKvBl5eVdVDvZIkSZOeM7oTx+XAQQBJtgE2B27s5cSqegC4P8mezaaDgXlNW38CvA/4i6p6dFUXLUmSNFE5oztxnAKcmmQRnSUGh1XV4mbmtxeHNudPB24F3tRs/yTwDOBbTVtXVNURgzchSZLUHvE32Rpo3c3XrR3fs2O/y5BG7PtHf7/fJUiSxlmSBVU1Z7B9Ll2QJElSKxl0JUmS1EoGXUmSJLWSQVeSJEmtZNCVJElSKxl0JUmS1EoGXUmSJLWSXxihFWy38XY+j1SSJE16zuhKkiSplQy6kiRJaiWDriRJklrJoCtJkqRWMuhKkiSplQy6kiRJaiUfL6YVPHTjjczba+9+l6Ee7H35vH6XIEnShOWMriRJklrJoCtJkqRWMuhKkiSplQy6kiRJaiWDriRJklrJoCtJkqRWMuhKkiSplQy6kiRJaiWDriRJklppSgXdJH+ZpJJsN4Z97J/kRWPV/nj3I0mSNFlNqaALzAW+B7x+LBpPsjqwPzAeAXS8+pEkSZqUpkzQTbIusAfwFpqgm2STJJcnWZjkuiR7JpmW5Mzm/aIk72qOnZ3kiiTXJrkgyQbN9suS/N8k84D3AX8BnNS0uVWz/2NNPz9NskuSryS5Ock/dtX3xiRXNed9Ksm0ZvvDSU5Ick3T/7OTvHRgP0Nc81uTXN2ce36S6WM3wpIkSRPLlAm6dGZA/6uqbgLuS7IT8Abg4qqaDewILARmA5tW1fZV9WLgjOb8s4H3VdUOwCLgQ11tz6yqvavqBOCrwHuqanZV3dLs/31V7QWcCvwncCSwPXBYkg2TvBA4ENijqWUpcFBz7jrAFVW1I3A58Naq+sEQ/Qz0larapTn3p3RC/qCSHJ5kfpL5DzzxxHDjKEmSNCms3u8CxtFc4F+b119s3n8N+EySNYALq2phkluBLZOcDHwduCTJ+nTC7Lzm/LOAL3e1fe5K+v5q8+ci4Pqqugug6Wsz4GXAzsDVSQDWBu5uzvk9cFHzegHwRyO45u2bWeOZwLrAxUMdWFWnAacBbDtjRo2gD0mSpAlpSgTdJBsC+9EJfgVMAwp4L7AX8CrgnCQnVdXZSXYE/pjOzOvrgHetpItHVrJ/cfPnsq7Xy9+vDgQ4
q6qOGeTcJ6pqefBcysj+zs4E9q+qa5IcBuwzgnMlSZImtamydOEA4Oyq2qKqZlXVZsBtdELu3VX1H8CngZ2SbASsVlXnA38P7FRVDwD3J9mzae9gYN6K3QDwEDBjhPV9GzggycYASZ6ZZIuVnNNLPzOAu5oZ64NWcqwkSVKrTIkZXTrLFE4csO18OjOejyR5AngYOATYFDgjyfJ/BCyfZT0UOLW5oetW4E1D9PVF4D+SvINOwF6pqvpJkg/QWSaxGvAEndnk24c57Sn9DLFO9++BK5t2FjHyAC5JkjRp5cnfiksd286YUae9ZKd+l6Ee7H35UL9YkCRpakiyoKrmDLZvqixdkCRJ0hQzVZYutFqSf6PzjOBuH6+qMwY7XpIkaSow6LZAVR3Z7xokSZImGpcuSJIkqZUMupIkSWolg64kSZJayaArSZKkVvJmNK1gxrbb+nxWSZI06TmjK0mSpFYy6EqSJKmVDLqSJElqJYOuJEmSWsmgK0mSpFbyqQtawd13PMAn//fX+l3GlHPUR/+83yVIktQqzuhKkiSplQy6kiRJaiWDriRJklrJoCtJkqRWMuhKkiSplQy6kiRJaiWDriRJklrJoCtJkqRWMuhKkiSplcY06CZ5TpIvJrklyU+SfCPJNmPZ5yA1zEryhq73c5J8onm9T5KXdu07IskhT7OffZI8kOTHSW5McnmSV/fa9sBaJEmSNDpj9hXASQJcAJxVVa9vts0Gng3cNFb9DmIW8Abg8wBVNR+Y3+zbB3gY+EGz79RR9vXdqno1/M+1Xpjksar6dg9tP6UWSZIkjc5YzujuCzzRHfCqaiHwvSQnJbkuyaIkB8L/zGjOS/KlJDclOTHJQUmuao7bqjnuzCSnJvluc9zyYDmtaffqJNcm+Zum2xOBPZMsTPKupp+LkswCjgDe1ezbM8lxSd7dtDc7yRVNWxck2aDZflmSjzR13ZRkz8EuvrnW44GjmvO6235HM8N9bTPjPVgtf57kymaG+L+TPLurnc80ddya5B3L+0xySNPmNUnOabY9K8n5zbhcnWSP0f7FSpIkTQZjNqMLbA8sGGT7/wJmAzsCGwFXJ7m82bcj8ELgPuBW4PSq2jXJ3wJHA+9sjpsF7A1sBVya5AXAIcADVbVLkmcA309yCfB+4N1dM637AFTVz5OcCjxcVf+v2ffyrjrPBo6uqnlJjgc+1NX/6k1df9Zsf8UQY/Aj4D2DbH8/8PyqWpxkZlX9bpBaNgD+sKoqyV8D7wX+d3P+dnT+ITEDuDHJvwPbAMcCe1TVvUme2Rz7ceBjVfW9JJsDFzdj/BRJDgcOB9hgxrOGuBxJkqTJY6VBt1mCcBCwZVUd34Sl51TVVU+zz5cBX6iqpcBvkswDdgEeBK6uqruafm8BLmnOWUQn2C33papaBtyc5FY6we+VwA5JDmiOWR/YGvj9SAtMsj4ws6rmNZvOAr7cdchXmj8X0AndQzY1xPZrgc8luRC4cIhjngecm2QTYE3gtq59X6+qxcDiJHfTWQ6yH3BeVd0LUFX3Nce+AnhR568RgPWSzKiqh7o7q6rTgNMANn/O1jXMNUmSJE0KvSxdOAXYHZjbvH8I+Lcezrse2HmQ7UOFP4DFXa+Xdb1fxlND+cAgVk27R1fV7Obn+VV1CWNjeV1LGf4fCy8BfjrI9lfRGcOdgQVJBmvjZOCTVfVi4G+AtQbpv7uGsOK4QOfvePeucdl0YMiVJElqo16C7m5VdSTwOEBV3U9nhnFlvgM8I8lbl29IsgtwP3Bgs6b2WcBewEhnh1+bZLVm3e6WwI10fiX/tiRrNH1tk2QdOsF8xhDtDLqvqh4A7u9af3swMG/gccNJsgPw9wz4R0GS1YDNqupSOssRZgLrDlLL+sCvmteH9tDlt4HXJdmw6Wf50oVLaNYJN9tnj+Q6JEmSJqte1ug+kWQazWxhE06XreykZm3pXwL/muT9dILyz+msc10XuKZp871V9esk242g7hvp
BM9nA0dU1eNJTqezjOBHzXKLe4D96SwTWJLkGuBM4Mdd7XwNOC/Ja+isAe52KHBqkul01gu/qYe69kzyY2A6cDfwjqr69oBjpgGfbZZHhM762d8lGVjLccCXk/wKuAJ4/nAdV9X1SU4A5iVZ2lznYcA7gH9Lci2dv+/L6dz4JkmS1GqpGn45ZpKDgAOBneisVT0A+EBVfXnYE8dIkjOBi6rqvH70PxVs/pyt670H/Uu/y5hyjvron/e7BEmSJp0kC6pqzmD7hp3RbX7NfhudX7G/nM4M5P5VNdi6U0mSJGnCGDboVtWyJB+tqt2BG8appmFV1WH9rkGSJEkTXy83o12S5K/S9XwqSZIkaaLr5Wa0vwPWoXND1+M0j7GqqvXGtDJJkiRpFFYadKtqqEdzSZIkSRNWL9+Mttdg26vq8sG2S5IkSRNBL0sX3tP1ei1gVzpffbvfmFQkSZIkrQK9LF14ysM9k2wG/POYVaS+2/h56/tMV0mSNOn18tSFge4Atl/VhUiSJEmrUi9rdE+m+fpfOsF4Np2v75UkSZImrF7W6M7ver0E+EJVfX+M6pEkSZJWiV6C7syq+nj3hiR/O3CbJEmSNJH0skb30EG2HbaK65AkSZJWqSFndJPMBd4APD/JV7t2zQB+O9aFSZIkSaMx3NKFHwB3ARsBH+3a/hBw7VgWpf6667ZbOOGNB/S7jCnh2M+e1+8SJElqrSGDblXdDtwO7D5+5UiSJEmrxkrX6Cb5wyRXJ3k4ye+TLE3y4HgUJ0mSJD1dvdyM9klgLnAzsDbw18DJY1mUJEmSNFq9PF6MqvpZkmlVtRQ4I8kPxrguSZIkaVR6CbqPJlkTWJjkn+ncoLbO2JYlSZIkjU4vSxcObo47CngE2Az4q7EsSpIkSRqtlc7oVtXtSdYGNqmqD49DTZIkSdKo9fLUhT8HFgL/1byfPeALJCRJkqQJp5elC8cBuwK/A6iqhcCssSpIkiRJWhV6CbpLquqBMa9knCT5yySVZLsx7GP/JC8aq/bHux9JkqTJqJege12SNwDTkmyd5GQ6Xw88Wc0Fvge8fiwaT7I6sD8wHgF0vPqRJEmadIYMuknOaV7eAvwBsBj4AvAg8M4xr2wMJFkX2AN4C03QTbJJksuTLExyXZI9k0xLcmbzflGSdzXHzk5yRZJrk1yQZINm+2VJ/m+SecD7gL8ATmra3KrZ/7Gmn58m2SXJV5LcnOQfu+p7Y5KrmvM+lWRas/3hJCckuabp/9lJXjqwnyGu+QVJ/rs590dDHSdJktQ2wz11YeckWwAHAvsCH+3aNx14fCwLGyP7A/9VVTcluS/JTnSu7eKqOqEJltOB2cCmVbU9QJKZzflnA0dX1bwkxwMf4snQP7Oq9m6O3xq4qKrOa94D/L6q9kryt8B/AjsD9wG3JPkYsDGdsd6jqp5IcgpwUNPnOsAVVXVs8yzjt1bVPzY3Bf5PP0P4HHBiVV2QZC2G+MdNksOBwwHWn752T4MpSZI0kQ0XdE+l86SFLYH5XdsDVLN9spkL/Gvz+ovN+68Bn0myBnBhVS1MciuwZbNM4+vAJUnWpxNm5zXnnwV8uavtc1fS9/InVSwCrq+quwCavjYDXkYn/F7dBOO1gbubc34PXNS8XgD8US8Xm2QGncB+AUBVDfmPk6o6DTgNYNMNN6he2pckSZrIhgy6VfUJ4BNJ/r2q3jaONY2JJBsC+wHbJylgGp3A/l5gL+BVwDlJTqqqs5PsCPwxcCTwOuBdK+nikZXsX9z8uazr9fL3q9P5B8RZVXXMIOc+UVXLw+dSevzq5qZNSZKkKWmlN6O1IeQ2DgDOrqotqmpWVW0G3EYn5N5dVf8BfBrYKclGwGpVdT7w98BOzZMn7k+yZ9PewcC8FbsB4CFgxgjr+zZwQJKNAZI8s1k6Mpxh+6mqB4E7kuzftPmMJNNHWJckSdKk1MtTF9piLnDBgG3nA2cCC5P8mM5XG38c2BS4LMnCZv/yWdZD
6dz8dS2ddbzHD9HXF4H3JPlxrzd/VdVPgA/QWSZxLfAtYJOVnNZLPwcD72ja/AHwnF7qkSRJmuzy5G/EpY5NN9yg3v6nL+93GVPCsZ8d7j5CSZK0MkkWVNWcwfZNpRldSZIkTSG93tSkCS7Jv9F5RnC3j1fVGf2oR5Ikqd8Mui1RVUf2uwZJkqSJxKULkiRJaiWDriRJklrJoCtJkqRWMuhKkiSplbwZTSvY5Plb+XxXSZI06TmjK0mSpFYy6EqSJKmVDLqSJElqJYOuJEmSWsmgK0mSpFYy6EqSJKmVfLyYVvD4XQ/x0xO+0+8yJr0XHrtfv0uQJGlKc0ZXkiRJrWTQlSRJUisZdCVJktRKBl1JkiS1kkFXkiRJrWTQlSRJUisZdCVJktRKBl1JkiS1kkF3AkpybJLrk1ybZGGS3ZK8M8n0ftcmSZI0WfjNaBNMkt2BVwM7VdXiJBsBawLnAp8FHh1BW9OqaunYVCpJkjSxOaM78WwC3FtViwGq6l7gAOC5wKVJLgVIMjfJoiTXJfnI8pOTPJzk+CRXArsneWOSq5qZ4U8lmdaHa5IkSRp3Bt2J5xJgsyQ3JTklyd5V9QngTmDfqto3yXOBjwD7AbOBXZLs35y/DnBdVe0G/BY4ENijqmYDS4GDxvVqJEmS+sSlCxNMVT2cZGdgT2Bf4Nwk7x9w2C7AZVV1D0CSzwF7ARfSCbPnN8e9HNgZuDoJwNrA3YP1m+Rw4HCATdbfeBVekSRJUn8YdCegZl3tZcBlSRYBhw44JMOc/njXutwAZ1XVMT30eRpwGsD2m25bIy5akiRpgnHpwgSTZNskW3dtmg3cDjwEzGi2XQnsnWSjZs3tXGDeIM19GzggycZN289MssWYFS9JkjSBOKM78awLnJxkJrAE+BmdJQVzgW8muatZp3sMcCmdWdtvVNV/Dmyoqn6S5APAJUlWA54AjqQTnCVJklrNoDvBVNUC4KWD7Dq5+Vl+3OeBzw9y/roD3p9L59FkkiRJU4pLFyRJktRKBl1JkiS1kkFXkiRJrWTQlSRJUisZdCVJktRKBl1JkiS1kkFXkiRJrWTQlSRJUiv5hRFawVqbzOCFx+7X7zIkSZJGxRldSZIktZJBV5IkSa1k0JUkSVIrGXQlSZLUSgZdSZIktZJPXdAK7rzzTo477rh+lzFhOTaSJE0OzuhKkiSplQy6kiRJaiWDriRJklrJoCtJkqRWMuhKkiSplQy6kiRJaiWDriRJklrJoCtJkqRWMuhKkiSplfxmtEkkyVJgEbAGsAQ4C/jXqlrW18IkSZImIIPu5PJYVc0GSLIx8HlgfeBD/SxKkiRpInLpwiRVVXcDhwNHpWNWku8m+VHz81KAJOckec3y85J8Lslf9KtuSZKk8WLQncSq6lY6f4cbA3cDf1RVOwEHAp9oDjsdeBNAkvWBlwLfGNhWksOTzE8y/9FHHx2P8iVJksaUQXfyS/PnGsB/JFkEfBl4EUBVzQNe0Cx1mAucX1VLBjZSVadV1ZyqmjN9+vRxKl2SJGnsuEZ3EkuyJbCUzmzuh4DfADvS+QfM412HngMcBLweePM4lylJktQXBt1JKsmzgFOBT1ZVNcsS7qiqZUkOBaZ1HX4mcBXw66q6fvyrlSRJGn8G3cll7SQLefLxYucA/9LsOwU4P8lrgUuBR5afVFW/SfJT4MJxrVaSJKmPDLqTSFVNG2bfzcAOXZuOWf4iyXRga+ALY1edJEnSxOLNaC2X5BXADcDJVfVAv+uRJEkaL87otlxV/Teweb/rkCRJGm/O6EqSJKmVDLqSJElqJYOuJEmSWsmgK0mSpFYy6EqSJKmVUlX9rkETzJw5c2r+/Pn9LkOSJGmlkiyoqjmD7XNGV5IkSa1k0JUkSVIrGXQlSZLUSgZdSZIktZJBV5IkSa1k0JUkSVIrrd7vAjTx3H//T/nSl3ftdxkTwutee1W/S5AkSU+TM7qSJElqJYOuJEmSWsmgK0mSpFYy6EqS
JKmVDLqSJElqJYOuJEmSWsmgK0mSpFYy6EqSJKmVDLp9lqSSnNP1fvUk9yS56Gm2NzPJ27ve7/N025IkSZrMDLr99wiwfZK1m/d/BPxqFO3NBN6+soMkSZLazqA7MXwTeFXzei7wheU7kjwzyYVJrk1yRZIdmu3HJflMksuS3JrkHc0pJwJbJVmY5KRm27pJzktyQ5LPJcl4XZgkSVK/GHQnhi8Cr0+yFrADcGXXvg8DP66qHYD/A5zdtW874I+BXYEPJVkDeD9wS1XNrqr3NMe9BHgn8CJgS2CPMbwWSZKkCcGgOwFU1bXALDqzud8YsPtlwDnNcd8BNkyyfrPv61W1uKruBe4Gnj1EF1dV1R1VtQxY2PT1FEkOTzI/yfwHH1wyyiuSJEnqP4PuxPFV4P/RtWyhMdgyg2r+XNy1bSmw+hBtr/S4qjqtquZU1Zz11huqGUmSpMnDoDtxfAY4vqoWDdh+OXAQdJ6gANxbVQ8O085DwIyxKFCSJGkycepugqiqO4CPD7LrOOCMJNcCjwKHrqSd3yb5fpLr6Nzk9vVVXaskSdJkYNDts6pad5BtlwGXNa/vA14zyDHHDXi/fdfrNww4/LKufUeNolxJkqRJw6ULkiRJaiWDriRJklrJoCtJkqRWMuhKkiSplQy6kiRJaiWDriRJklrJoCtJkqRWMuhKkiSplfzCCK1ggw1eyOtee1W/y5AkSRoVZ3QlSZLUSgZdSZIktZJBV5IkSa1k0JUkSVIrGXQlSZLUSj51QSv4yf0PsuN5F/e7jL675oA/7ncJkiRpFJzRlSRJUisZdCVJktRKBl1JkiS1kkFXkiRJrWTQlSRJUisZdCVJktRKBl1JkiS1kkFXkiRJrWTQlSRJUisZdCVJktRKBt0uSY5Ncn2Sa5MsTLJbkncmmb6K2r+yafcXSe5pXi9MMmtVtN9D/2cmOWA8+pIkSeq31ftdwESRZHfg1cBOVbU4yUbAmsC5wGeBR0fQ1rSqWjpwe1Xt1uw/DJhTVUetitolSZK0Imd0n7QJcG9VLQaoqnuBA4DnApcmuRQgydwki5Jcl+Qjy09O8nCS45NcCeye5I1JrmpmbD+VZNrADpOsluTmJM/qev+zJBs1s6+nJvlukpuSvLo5ZlqSk5Jc3cw8/81QF5SOTyb5SZKvAxsPc+zhSeYnmb/kwQeezvhJkiRNKAbdJ10CbNaEylOS7F1VnwDuBPatqn2TPBf4CLAfMBvYJcn+zfnrANc1s7a/BQ4E9qiq2cBS4KCBHVbVMjqzxcv3vQK4pgnZALOAvYFXAacmWQt4C/BAVe0C7AK8Ncnzh7imvwS2BV4MvBV46VAXX1WnVdWcqpqz+nrrDzNMkiRJk4NBt1FVDwM7A4cD9wDnNksMuu0CXFZV91TVEuBzwF7NvqXA+c3rlzdtXZ1kYfN+yyG6/gxwSPP6zcAZXfu+VFXLqupm4FZgO+CVwCFNu1cCGwJbD9H2XsAXqmppVd0JfGfIAZAkSWoZ1+h2adbVXgZclmQRcOiAQzLM6Y93rcsNcFZVHdNDn79M8psk+wG78dSZ3xp4eNP20VV18craHqINSZKkKcEZ3UaSbZN0z4zOBm4HHgJmNNuuBPZu1tBOA+YC8wZp7tvAAUk2btp+ZpIthun+dDpLGL404Ca21zbrdreiMyN8I3Ax8LYkazRtb5NknSHavRx4fbOudxNg32FqkCRJahVndJ+0LnBykpnAEuBndJYxzAW+meSuZp3uMcCldGZWv1FV/zmwoar6SZIPAJckWQ14AjiSTnAezFfpLFk4Y8D2G+kE6WcDR1TV40lOp7N290dJQmeZxf5DtHsBnfXEi4CbGDyUS5IktVKq/M12vyWZA3ysqvbs2nYmcFFVnTfe9Uzfapva+iMnj3e3E841B/xxv0uQJEkrkWRBVc0ZbJ8zun2W5P3A2xjkqQySJEl6+gy6fVZVJwInDrL9sF7bSPJi4JwBmxcv/4IKSZKkqcig2wJV
tYjOzXOSJElq+NQFSZIktZJBV5IkSa1k0JUkSVIrGXQlSZLUSt6MphW8aIP1mO8zZCVJ0iTnjK4kSZJayaArSZKkVvIrgLWCJA8BN/a7jiliI+DefhcxRTjW48exHj+O9fhxrMfPSMd6i6p61mA7XKOrwdw41HdGa9VKMt+xHh+O9fhxrMePYz1+HOvxsyrH2qULkiRJaiWDriRJklrJoKvBnNbvAqYQx3r8ONbjx7EeP471+HGsx88qG2tvRpMkSVIrOaMrSZKkVjLoTiFJ/iTJjUl+luT9g+xPkk80+69NslOv5+qpRjnWP0+yKMnCJPPHt/LJp4ex3i7JD5MsTvLukZyrpxrlWPu5HoEexvqg5r8d1yb5QZIdez1XTzXKsfZzPQI9jPVrmnFemGR+kpf1eu6QqsqfKfADTANuAbYE1gSuAV404Jg/A74JBPhD4Mpez/Vn1Yx1s+/nwEb9vo7J8NPjWG8M7AKcALx7JOf6s2rGutnn53rVjvVLgQ2a13/qf6/Hf6yb936uV+1Yr8uTy2p3AG7o9dyhfpzRnTp2BX5WVbdW1e+BLwKvGXDMa4Czq+MKYGaSTXo8V08azVhrZFY61lV1d1VdDTwx0nP1FKMZa41ML2P9g6q6v3l7BfC8Xs/VU4xmrDUyvYz1w9UkW2AdoHo9dygG3aljU+CXXe/vaLb1ckwv5+pJoxlr6PwP+5IkC5IcPmZVtsNoPpt+rkdmtOPl57p3Ix3rt9D5DdHTOXeqG81Yg5/rkehprJP8ZZIbgK8Dbx7JuYPxm9GmjgyybeAjN4Y6ppdz9aTRjDXAHlV1Z5KNgW8luaGqLl+lFbbHaD6bfq5HZrTj5ee6dz2PdZJ96YSv5WsZ/VyPzGjGGvxcj0RPY11VFwAXJNkL+AfgFb2eOxhndKeOO4DNut4/D7izx2N6OVdPGs1YU1XL/7wbuIDOr2w0uNF8Nv1cj8yoxsvP9Yj0NNZJdgBOB15TVb8dybn6H6MZaz/XIzOiz2bzD4atkmw00nO7GXSnjquBrZM8P8mawOuBrw445qvAIc0TAf4QeKCq7urxXD3paY91knWSzABIsg7wSuC68Sx+khnNZ9PP9cg87fHycz1iKx3rJJsDXwEOrqqbRnKunuJpj7Wf6xHrZaxfkCTN653o3Hj2217OHYpLF6aIqlqS5CjgYjp3L36mqq5PckSz/1TgG3SeBvAz4FHgTcOd24fLmBRGM9bAs+n8ygY6//v8fFX91zhfwqTRy1gneQ4wH1gPWJbknXTu1n3Qz3XvRjPWwEb4ue5Zj/8N+SCwIXBKM65LqmqO/70emdGMNf73ekR6HOu/ojMJ9ATwGHBgc3Pa0/5c+81okiRJaiWXLkiSJKmVDLqSJElqJYOuJEmSWsmgK0mSpFYy6EqSJKmVDLqSNEUl+cE49zcryRvGs09JU5tBV5KmqKp66Xj1lWR1YBZg0JU0bnyOriRNUUkerqp1k+wDfBj4DTCbzrdALQL+Flgb2L+qbklyJvA48Ad0Hpb/d1V1UZK1gH8H5gBLmu2XJjkMeBWwFrAOMB14IXAbcBadr0w9p9kHcFRV/aCp5zjgXmB7YAHwxqqqJLsAH2/OWQy8nM6XrpwI7AM8A/i3qvrUqhwrSZOT34wmSQLYkU4IvQ+4FTi9qnZN8rfA0cA7m+NmAXsDWwGXJnkBcCRAVb04yXbAJUm2aY7fHdihqu5rAuy7q+rVAEmmA39UVY8n2Rr4Ap2wDPASOoH6TuD7wB5JrgLOpfNtSVcnWY/Otye9hc7XaO+S5BnA95NcUlW3rfJRkjSpGHQlSQBXV9VdAEluAS5pti8C9u067ktVtQy4OcmtwHbAy4CTAarqhiS3A8uD7req6r4h+lwD+GSS2cDSrnMArqqqO5p6FtIJ2A8Ad1XV1U1fDzb7XwnskOSA5tz1ga3pzBxLmsIMupIk6CwDWG5Z1/tlPPX/Kwaudysgw7T7yDD73kVnucSOdO4ZeXyIepY2
NWSQ/mm2H11VFw/Tl6QpyJvRJEkj8dokqyXZCtgSuBG4HDgIoFmysHmzfaCHgBld79enM0O7DDgYmLaSvm8Antus0yXJjOYmt4uBtyVZY3kNSdYZph1JU4QzupKkkbgRmEfnZrQjmvW1pwCnJllE52a0w6pqcbLCRO+1wJIk1wBnAqcA5yd5LXApw8/+UlW/T3IgcHKStemsz30FcDqdpQ0/SqfTe4D9V8G1SprkfOqCJKknzVMXLqqq8/pdiyT1wqULkiRJaiVndCVJktRKzuhKkiSplQy6kiRJaiWDriRJklrJoCtJkqRWMuhKkiSplQy6kiRJaqX/D7SsoacIrlsQAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 720x432 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "import seaborn as sns\n",
    "plt.figure(figsize=(10,6))\n",
    "plt.title('Feature Importance')\n",
    "sns.barplot(data=importance_df.head(10), x='importance', y='feature');"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "92d5fd89",
   "metadata": {},
   "source": [
    "- K 折交叉验证"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "75ce47fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "def train_and_evaluate(X_train, train_targets, X_val, val_targets, **params):\n",
    "    \"\"\"Fit an XGBRegressor on the training split and score both splits.\n",
    "\n",
    "    Extra keyword arguments are forwarded to the XGBRegressor constructor\n",
    "    together with the fixed ``random_state=42`` and ``n_jobs=-1``.\n",
    "\n",
    "    Returns:\n",
    "        A ``(model, train_rmse, val_rmse)`` tuple; both errors are computed\n",
    "        with the notebook's ``rmse`` helper.\n",
    "    \"\"\"\n",
    "    regressor = XGBRegressor(random_state=42, n_jobs=-1, **params)\n",
    "    regressor.fit(X_train, train_targets)\n",
    "\n",
    "    # Score the fitted model on each split, training split first.\n",
    "    split_errors = [\n",
    "        rmse(regressor.predict(features), actual)\n",
    "        for features, actual in ((X_train, train_targets), (X_val, val_targets))\n",
    "    ]\n",
    "    return regressor, split_errors[0], split_errors[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "1f52ed5e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5-fold splitter used by the cross-validation loop below; only n_splits is\n",
    "# set, so every other KFold option keeps its library default.\n",
    "kfold = KFold(n_splits=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "ec0e5498",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train RMSE: 2414.7936312858537, Validation RMSE: 2467.171620065982\n",
      "Train RMSE: 2404.0534016734114, Validation RMSE: 2441.6522708946354\n",
      "Train RMSE: 2422.521349581155, Validation RMSE: 2403.6346008232986\n",
      "Train RMSE: 2356.396019856885, Validation RMSE: 2465.678539017347\n",
      "Train RMSE: 2413.4193964653396, Validation RMSE: 2479.248192000345\n"
     ]
    }
   ],
   "source": [
    "# One fitted model per fold, appended in fold order.\n",
    "models = []\n",
    "\n",
    "# NOTE: X_train, train_targets, X_val and val_targets are rebound at notebook\n",
    "# scope on every iteration, so after the loop they hold the last fold's split.\n",
    "for train_idxs, val_idxs in kfold.split(X):\n",
    "    # kfold.split yields positional row indices, hence .iloc.\n",
    "    X_train, train_targets = X.iloc[train_idxs], targets.iloc[train_idxs]\n",
    "    X_val, val_targets = X.iloc[val_idxs], targets.iloc[val_idxs]\n",
    "    # Every fold is trained with the same max_depth=4 / n_estimators=20 setting.\n",
    "    model, train_rmse, val_rmse = train_and_evaluate(X_train, \n",
    "                                                     train_targets, \n",
    "                                                     X_val, \n",
    "                                                     val_targets, \n",
    "                                                     max_depth=4, \n",
    "                                                     n_estimators=20)\n",
    "    models.append(model)\n",
    "    print('Train RMSE: {}, Validation RMSE: {}'.format(train_rmse, val_rmse))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "2cc93638",
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict_avg(models, inputs):\n",
    "    \"\"\"Average the predictions of several fitted models.\n",
    "\n",
    "    Each model's ``predict(inputs)`` result is collected in order and the\n",
    "    element-wise mean across models is returned.\n",
    "    \"\"\"\n",
    "    per_model_preds = []\n",
    "    for fitted in models:\n",
    "        per_model_preds.append(fitted.predict(inputs))\n",
    "    # axis=0 averages across models, keeping one value per input row.\n",
    "    return np.mean(per_model_preds, axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "0114f05a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([8144.918 , 7517.7554, 8913.873 , ..., 8067.2563, 9229.947 ,\n",
       "       8738.672 ], dtype=float32)"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Average the per-fold models' predictions.\n",
    "# NOTE(review): X_train is not defined in this cell; it is whatever the\n",
    "# notebook-level X_train was last bound to (presumably the final fold of the\n",
    "# cross-validation loop) - confirm this is the intended input.\n",
    "preds = predict_avg(models, X_train)\n",
    "preds"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cf04b622",
   "metadata": {},
   "source": [
    "- 超参数调整和正则化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "3f779422",
   "metadata": {},
   "outputs": [],
   "source": [
    "def test_params_kfold(n_splits, **params):\n",
    "    \"\"\"Cross-validate one XGBoost configuration with KFold.\n",
    "\n",
    "    Splits the notebook-level ``X`` / ``targets`` into ``n_splits`` folds,\n",
    "    trains one model per fold through ``train_and_evaluate`` (forwarding\n",
    "    ``params``) and prints the mean train and validation RMSE.\n",
    "\n",
    "    Returns:\n",
    "        The list of per-fold models, in fold order.\n",
    "    \"\"\"\n",
    "    fold_results = []\n",
    "    splitter = KFold(n_splits)\n",
    "    for fit_rows, holdout_rows in splitter.split(X):\n",
    "        # Each entry is the (model, train_rmse, val_rmse) tuple for one fold.\n",
    "        fold_results.append(\n",
    "            train_and_evaluate(\n",
    "                X.iloc[fit_rows],\n",
    "                targets.iloc[fit_rows],\n",
    "                X.iloc[holdout_rows],\n",
    "                targets.iloc[holdout_rows],\n",
    "                **params\n",
    "            )\n",
    "        )\n",
    "    models = [result[0] for result in fold_results]\n",
    "    train_rmses = [result[1] for result in fold_results]\n",
    "    val_rmses = [result[2] for result in fold_results]\n",
    "    print('Train RMSE: {}, Validation RMSE: {}'.format(np.mean(train_rmses), np.mean(val_rmses)))\n",
    "    return models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "7852af7d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hold out 10% of the rows for validation. The split is seeded so that the\n",
    "# RMSE figures printed by the tuning cells below are reproducible across\n",
    "# kernel restarts (the models themselves already use random_state=42).\n",
    "X_train, X_val, train_targets, val_targets = train_test_split(\n",
    "    X, targets, test_size=0.1, random_state=42\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "0ed1f6b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "def test_params(**params):\n",
    "    \"\"\"Train one XGBoost configuration on the hold-out split and print its RMSE.\n",
    "\n",
    "    Reads the notebook-level ``X_train`` / ``train_targets`` / ``X_val`` /\n",
    "    ``val_targets`` split and forwards ``params`` to ``train_and_evaluate``,\n",
    "    which builds the XGBRegressor with ``random_state=42`` and ``n_jobs=-1``.\n",
    "    Prints the train and validation RMSE and returns nothing.\n",
    "    \"\"\"\n",
    "    # Reuse the shared helper instead of repeating the fit/score sequence.\n",
    "    _, train_rmse, val_rmse = train_and_evaluate(\n",
    "        X_train, train_targets, X_val, val_targets, **params\n",
    "    )\n",
    "    print('Train RMSE: {}, Validation RMSE: {}'.format(train_rmse, val_rmse))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "55c075e0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train RMSE: 2412.206862830845, Validation RMSE: 2428.2967201925658\n"
     ]
    }
   ],
   "source": [
    "test_params(n_estimators=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "478cc32d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train RMSE: 2416.2372927698643, Validation RMSE: 2429.450532771219\n"
     ]
    }
   ],
   "source": [
    "test_params(max_depth=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "ff6c962b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train RMSE: 1503.4743795063553, Validation RMSE: 1518.1417137048907\n"
     ]
    }
   ],
   "source": [
    "test_params(max_depth=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "8dba4986",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train RMSE: 5050.9249946985965, Validation RMSE: 5056.824977594182\n"
     ]
    }
   ],
   "source": [
    "test_params(n_estimators=50, learning_rate=0.01)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "9ec6ddaa",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train RMSE: 2203.568029826842, Validation RMSE: 2222.8839609391002\n"
     ]
    }
   ],
   "source": [
    "test_params(n_estimators=50, learning_rate=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "52361f64-bcf7-4108-b0cd-616bafded160",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_params(booster='gblinear')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "d2bbdff1-7306-4606-bfc8-937ba4264b0e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train RMSE: 2369.130317834184, Validation RMSE: 3369.7144918390945\n"
     ]
    }
   ],
   "source": [
    "test_params(n_estimators=1000, learning_rate=0.2, max_depth=10, subsample=0.9, colsample_bytree=0.7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "a5d4dddc",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = XGBRegressor(n_jobs=-1, random_state=42, n_estimators=1000, \n",
    "                     learning_rate=0.2, max_depth=10, subsample=0.9, \n",
    "                     colsample_bytree=0.7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "ebf2ffeb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
       "             colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,\n",
       "             importance_type='gain', interaction_constraints='',\n",
       "             learning_rate=0.2, max_delta_step=0, max_depth=10,\n",
       "             min_child_weight=1, missing=nan, monotone_constraints='()',\n",
       "             n_estimators=1000, n_jobs=-1, num_parallel_tree=1, random_state=42,\n",
       "             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.9,\n",
       "             tree_method='exact', validate_parameters=1, verbosity=None)"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(X, targets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "96cd7d12",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_preds = model.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "348b44d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "submission_df['Sales']  = test_preds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "efd39e27",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "11"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_df.Open.isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "f9784491",
   "metadata": {},
   "outputs": [],
   "source": [
    "submission_df['Sales'] = submission_df['Sales'] * test_df.Open.fillna(1.)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "1ad09467",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id</th>\n",
       "      <th>Sales</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>4341.630859</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>8693.640625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>9301.455078</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>6786.824219</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>6893.714844</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41083</th>\n",
       "      <td>41084</td>\n",
       "      <td>3420.328369</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41084</th>\n",
       "      <td>41085</td>\n",
       "      <td>7441.502441</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41085</th>\n",
       "      <td>41086</td>\n",
       "      <td>7421.017090</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41086</th>\n",
       "      <td>41087</td>\n",
       "      <td>23765.673828</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41087</th>\n",
       "      <td>41088</td>\n",
       "      <td>7068.877930</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>41088 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          Id         Sales\n",
       "0          1   4341.630859\n",
       "1          2   8693.640625\n",
       "2          3   9301.455078\n",
       "3          4   6786.824219\n",
       "4          5   6893.714844\n",
       "...      ...           ...\n",
       "41083  41084   3420.328369\n",
       "41084  41085   7441.502441\n",
       "41085  41086   7421.017090\n",
       "41086  41087  23765.673828\n",
       "41087  41088   7068.877930\n",
       "\n",
       "[41088 rows x 2 columns]"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "submission_df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5165686e-4615-4782-863f-31cc1896e6e5",
   "metadata": {},
   "source": [
    "## 8.神经网络集成学习"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "13263c59-0346-4122-a2b2-9de95ddd2af6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import torch.nn as nn\n",
    "from torch.nn import functional as F\n",
    "from torch.utils.data import TensorDataset, DataLoader\n",
    "from torchensemble import GradientBoostingRegressor,BaggingRegressor,FusionRegressor,VotingRegressor,SnapshotEnsembleRegressor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "3f3cdb14-6491-4488-b721-0e682828751b",
   "metadata": {},
   "outputs": [],
   "source": [
    "class MLP(nn.Module):\n",
    "    \"\"\"Three-layer fully connected regressor with ReLU activations.\n",
    "\n",
    "    Args:\n",
    "        in_features: number of input features per sample. Defaults to 23,\n",
    "            the value that used to be hard-coded.\n",
    "        hidden: width of the two hidden layers. Defaults to 128.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, in_features=23, hidden=128):\n",
    "        super().__init__()\n",
    "        self.linear1 = nn.Linear(in_features, hidden)\n",
    "        self.linear2 = nn.Linear(hidden, hidden)\n",
    "        self.linear3 = nn.Linear(hidden, 1)\n",
    "\n",
    "    def forward(self, x):\n",
    "        \"\"\"Return one prediction per sample, shaped (batch, 1).\"\"\"\n",
    "        # Flatten everything after the batch dimension before the first layer.\n",
    "        x = x.view(x.size(0), -1)\n",
    "        x = F.relu(self.linear1(x))\n",
    "        x = F.relu(self.linear2(x))\n",
    "        return self.linear3(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "d615ed0b-d226-4687-8d17-753a121afb2c",
   "metadata": {},
   "outputs": [],
   "source": [
    "class ensemble_model():\n",
    "    \"\"\"Thin fit/predict/score wrapper around a torchensemble-style regressor.\n",
    "\n",
    "    Args:\n",
    "        ensemble: ensemble class, called as\n",
    "            ``ensemble(estimator=..., n_estimators=..., cuda=...)``.\n",
    "        network: base estimator handed to the ensemble.\n",
    "        estimators: number of base estimators in the ensemble.\n",
    "        lr: learning rate passed to ``set_optimizer``.\n",
    "        weight_decay: weight decay passed to ``set_optimizer``.\n",
    "        epoch: number of epochs used by ``fit``.\n",
    "        use_cuda: forwarded to the ensemble as its ``cuda`` flag.\n",
    "    \"\"\"\n",
    "    def __init__(self,ensemble,network,estimators=10,lr=1e-3,weight_decay=5e-4,epoch=50,use_cuda=False):\n",
    "        super().__init__()\n",
    "        self.epoch=epoch\n",
    "        # Honour the caller's ensemble size; it used to be fixed at 10, which\n",
    "        # silently ignored the ``estimators`` argument.\n",
    "        self.model=ensemble(\n",
    "                        estimator=network,\n",
    "                        n_estimators=estimators,cuda=use_cuda)\n",
    "        # Configure the optimizer.\n",
    "        self.model.set_optimizer(\"Adam\", lr=lr, weight_decay=weight_decay)\n",
    "    def fit(self,X_train,Y_train):\n",
    "        \"\"\"Train the ensemble on array-like features and targets for ``self.epoch`` epochs.\"\"\"\n",
    "        X_train=torch.FloatTensor(np.array(X_train))\n",
    "        # Targets become a column so each sample has a (1,) label.\n",
    "        Y_train=torch.FloatTensor(np.array(Y_train)).reshape(-1,1)\n",
    "        # Tensor -> Data loader\n",
    "        train_data = TensorDataset(X_train, Y_train)\n",
    "        train_loader = DataLoader(train_data, batch_size=100, shuffle=True)  \n",
    "        self.model.fit(train_loader, epochs=self.epoch)\n",
    "    def predict(self,X_test):\n",
    "        \"\"\"Return the ensemble's predictions for array-like features as a NumPy array.\"\"\"\n",
    "        X_test=torch.FloatTensor(np.array(X_test))\n",
    "        # Predict\n",
    "        Y_pred = self.model.predict(X_test)\n",
    "        return np.array(Y_pred)\n",
    "    def score(self,X_test,Y_test):\n",
    "        \"\"\"Return whatever ``self.model.evaluate`` reports for the given features and targets.\"\"\"\n",
    "        X_test = torch.FloatTensor(np.array(X_test))\n",
    "        Y_test = torch.FloatTensor(np.array(Y_test)).reshape(-1, 1)\n",
    "        test_data = TensorDataset(X_test, Y_test)\n",
    "        # Evaluation keeps the original row order (shuffle=False).\n",
    "        test_loader = DataLoader(test_data, batch_size=10, shuffle=False)\n",
    "        testing_mse = self.model.evaluate(test_loader)\n",
    "        return testing_mse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "b025f577-a2d9-43e5-895d-97590a709fad",
   "metadata": {},
   "outputs": [],
   "source": [
    "mlp_model=ensemble_model(GradientBoostingRegressor,MLP,epoch=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "e70e8dea-b8c5-43e7-8b12-f480b6b366d7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(844392, 23)"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "c87fb292-918b-499a-8ad3-2f73389e2d02",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(844392,)"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "targets.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "39e38415-3832-4f46-9599-2094c7891422",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "神经网梯度提升集成学习 , MSE:1178782.559139635,mae:775.1471195999563,train_r2:0.8879223618521294,test_r2:0.8774864421275211\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
    "from sklearn.model_selection import KFold, train_test_split\n",
    "\n",
    "# Use names local to this experiment. The XGBoost cells read the notebook-level\n",
    "# X_train (inside test_params) and X_test (for the submission predictions), so\n",
    "# rebinding those names here would silently change what those cells operate on.\n",
    "# The split is seeded so the reported metrics refer to a reproducible hold-out.\n",
    "nn_X_train, nn_X_val, nn_y_train, nn_y_val = train_test_split(\n",
    "    X, targets, test_size=0.1, random_state=42\n",
    ")\n",
    "mlp_model.fit(nn_X_train, nn_y_train)\n",
    "Y_pred = mlp_model.predict(nn_X_val)\n",
    "\n",
    "# Hold-out errors, plus R^2 on both splits to compare train and hold-out fit.\n",
    "mse = mean_squared_error(nn_y_val, Y_pred)\n",
    "mae = mean_absolute_error(nn_y_val, Y_pred)\n",
    "train_r2 = r2_score(nn_y_train, mlp_model.predict(nn_X_train))\n",
    "test_r2 = r2_score(nn_y_val, Y_pred)\n",
    "print(f\"神经网梯度提升集成学习 , MSE:{mse},mae:{mae},train_r2:{train_r2},test_r2:{test_r2}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "8808960b-1875-4cc5-a3f5-5c359bf1ffb7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sklearn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0fd3e15f-f82c-4a4e-976b-b6cf59a7271e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pytorch",
   "language": "python",
   "name": "pytorch"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
